summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore5
-rw-r--r--kernel/Kconfig.kexec150
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Kconfig.preempt58
-rw-r--r--kernel/Makefile49
-rw-r--r--kernel/acct.c67
-rw-r--r--kernel/async.c157
-rw-r--r--kernel/audit.c220
-rw-r--r--kernel/audit.h50
-rw-r--r--kernel/audit_fsnotify.c29
-rw-r--r--kernel/audit_tree.c93
-rw-r--r--kernel/audit_watch.c34
-rw-r--r--kernel/auditfilter.c42
-rw-r--r--kernel/auditsc.c819
-rw-r--r--kernel/backtracetest.c2
-rw-r--r--kernel/bounds.c7
-rw-r--r--kernel/bpf/Kconfig103
-rw-r--r--kernel/bpf/Makefile27
-rw-r--r--kernel/bpf/arraymap.c480
-rw-r--r--kernel/bpf/bloom_filter.c206
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c240
-rw-r--r--kernel/bpf/bpf_inode_storage.c237
-rw-r--r--kernel/bpf/bpf_iter.c389
-rw-r--r--kernel/bpf/bpf_local_storage.c917
-rw-r--r--kernel/bpf/bpf_lru_list.c28
-rw-r--r--kernel/bpf/bpf_lru_list.h9
-rw-r--r--kernel/bpf/bpf_lsm.c342
-rw-r--r--kernel/bpf/bpf_struct_ops.c446
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h3
-rw-r--r--kernel/bpf/bpf_task_storage.c374
-rw-r--r--kernel/bpf/btf.c5197
-rw-r--r--kernel/bpf/cgroup.c1309
-rw-r--r--kernel/bpf/cgroup_iter.c359
-rw-r--r--kernel/bpf/core.c1177
-rw-r--r--kernel/bpf/cpumap.c563
-rw-r--r--kernel/bpf/cpumask.c482
-rw-r--r--kernel/bpf/devmap.c591
-rw-r--r--kernel/bpf/disasm.c132
-rw-r--r--kernel/bpf/disasm.h2
-rw-r--r--kernel/bpf/dispatcher.c51
-rw-r--r--kernel/bpf/hashtab.c1120
-rw-r--r--kernel/bpf/helpers.c2128
-rw-r--r--kernel/bpf/inode.c213
-rw-r--r--kernel/bpf/link_iter.c107
-rw-r--r--kernel/bpf/local_storage.c300
-rw-r--r--kernel/bpf/log.c831
-rw-r--r--kernel/bpf/lpm_trie.c64
-rw-r--r--kernel/bpf/map_in_map.c86
-rw-r--r--kernel/bpf/map_in_map.h4
-rw-r--r--kernel/bpf/map_iter.c155
-rw-r--r--kernel/bpf/memalloc.c1012
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/mprog.c452
-rw-r--r--kernel/bpf/net_namespace.c140
-rw-r--r--kernel/bpf/offload.c445
-rw-r--r--kernel/bpf/percpu_freelist.c134
-rw-r--r--kernel/bpf/percpu_freelist.h1
-rw-r--r--kernel/bpf/preload/.gitignore2
-rw-r--r--kernel/bpf/preload/Kconfig26
-rw-r--r--kernel/bpf/preload/Makefile7
-rw-r--r--kernel/bpf/preload/bpf_preload.h16
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c92
-rw-r--r--kernel/bpf/preload/iterators/.gitignore2
-rw-r--r--kernel/bpf/preload/iterators/Makefile67
-rw-r--r--kernel/bpf/preload/iterators/README7
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c118
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h419
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h435
-rw-r--r--kernel/bpf/prog_iter.c107
-rw-r--r--kernel/bpf/queue_stack_maps.c88
-rw-r--r--kernel/bpf/reuseport_array.c52
-rw-r--r--kernel/bpf/ringbuf.c454
-rw-r--r--kernel/bpf/stackmap.c585
-rw-r--r--kernel/bpf/syscall.c2858
-rw-r--r--kernel/bpf/sysfs_btf.c8
-rw-r--r--kernel/bpf/task_iter.c888
-rw-r--r--kernel/bpf/tcx.c346
-rw-r--r--kernel/bpf/tnum.c54
-rw-r--r--kernel/bpf/trampoline.c1018
-rw-r--r--kernel/bpf/verifier.c15969
-rw-r--r--kernel/capability.c127
-rw-r--r--kernel/cfi.c101
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h31
-rw-r--r--kernel/cgroup/cgroup-v1.c189
-rw-r--r--kernel/cgroup/cgroup.c1643
-rw-r--r--kernel/cgroup/cpuset.c2775
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/legacy_freezer.c38
-rw-r--r--kernel/cgroup/misc.c422
-rw-r--r--kernel/cgroup/namespace.c10
-rw-r--r--kernel/cgroup/pids.c37
-rw-r--r--kernel/cgroup/rdma.c4
-rw-r--r--kernel/cgroup/rstat.c349
-rw-r--r--kernel/compat.c29
-rw-r--r--kernel/configs/android-base.config161
-rw-r--r--kernel/configs/android-recommended.config129
-rw-r--r--kernel/configs/debug.config108
-rw-r--r--kernel/configs/hardening.config98
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/configs/nopm.config2
-rw-r--r--kernel/configs/rust.config2
-rw-r--r--kernel/configs/tiny-base.config1
-rw-r--r--kernel/configs/tiny.config5
-rw-r--r--kernel/configs/x86_debug.config18
-rw-r--r--kernel/configs/xen.config3
-rw-r--r--kernel/context_tracking.c617
-rw-r--r--kernel/cpu.c1080
-rw-r--r--kernel/cpu_pm.c91
-rw-r--r--kernel/crash_core.c645
-rw-r--r--kernel/cred.c337
-rw-r--r--kernel/debug/debug_core.c142
-rw-r--r--kernel/debug/gdbstub.c24
-rw-r--r--kernel/debug/kdb/kdb_bp.c84
-rw-r--r--kernel/debug/kdb/kdb_bt.c20
-rw-r--r--kernel/debug/kdb/kdb_debugger.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c61
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c7
-rw-r--r--kernel/debug/kdb/kdb_main.c1022
-rw-r--r--kernel/debug/kdb/kdb_private.h46
-rw-r--r--kernel/debug/kdb/kdb_support.c508
-rw-r--r--kernel/delayacct.c183
-rw-r--r--kernel/dma/Kconfig121
-rw-r--r--kernel/dma/Makefile8
-rw-r--r--kernel/dma/coherent.c203
-rw-r--r--kernel/dma/contiguous.c243
-rw-r--r--kernel/dma/debug.c298
-rw-r--r--kernel/dma/debug.h130
-rw-r--r--kernel/dma/direct.c602
-rw-r--r--kernel/dma/direct.h128
-rw-r--r--kernel/dma/dummy.c5
-rw-r--r--kernel/dma/map_benchmark.c358
-rw-r--r--kernel/dma/mapping.c586
-rw-r--r--kernel/dma/ops_helpers.c93
-rw-r--r--kernel/dma/pool.c165
-rw-r--r--kernel/dma/remap.c7
-rw-r--r--kernel/dma/swiotlb.c1858
-rw-r--r--kernel/dma/virt.c59
-rw-r--r--kernel/elfcore.c26
-rw-r--r--kernel/entry/Makefile13
-rw-r--r--kernel/entry/common.c398
-rw-r--r--kernel/entry/common.h7
-rw-r--r--kernel/entry/kvm.c49
-rw-r--r--kernel/entry/syscall_user_dispatch.c164
-rw-r--r--kernel/events/Makefile6
-rw-r--r--kernel/events/callchain.c18
-rw-r--r--kernel/events/core.c4458
-rw-r--r--kernel/events/hw_breakpoint.c674
-rw-r--r--kernel/events/hw_breakpoint_test.c332
-rw-r--r--kernel/events/internal.h16
-rw-r--r--kernel/events/ring_buffer.c92
-rw-r--r--kernel/events/uprobes.c175
-rw-r--r--kernel/exit.c585
-rw-r--r--kernel/exit.h30
-rw-r--r--kernel/extable.c63
-rw-r--r--kernel/fail_function.c38
-rw-r--r--kernel/fork.c1328
-rw-r--r--kernel/freezer.c129
-rw-r--r--kernel/futex.c4112
-rw-r--r--kernel/futex/Makefile3
-rw-r--r--kernel/futex/core.c1173
-rw-r--r--kernel/futex/futex.h386
-rw-r--r--kernel/futex/pi.c1269
-rw-r--r--kernel/futex/requeue.c903
-rw-r--r--kernel/futex/syscalls.c512
-rw-r--r--kernel/futex/waitwake.c734
-rw-r--r--kernel/gcov/Kconfig3
-rw-r--r--kernel/gcov/Makefile2
-rw-r--r--kernel/gcov/base.c49
-rw-r--r--kernel/gcov/clang.c222
-rw-r--r--kernel/gcov/fs.c110
-rw-r--r--kernel/gcov/gcc_4_7.c210
-rw-r--r--kernel/gcov/gcov.h14
-rwxr-xr-xkernel/gen_kheaders.sh27
-rw-r--r--kernel/groups.c24
-rw-r--r--kernel/hung_task.c109
-rw-r--r--kernel/iomem.c13
-rw-r--r--kernel/irq/Kconfig35
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c408
-rw-r--r--kernel/irq/chip.c128
-rw-r--r--kernel/irq/cpuhotplug.c6
-rw-r--r--kernel/irq/debugfs.c23
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/generic-chip.c58
-rw-r--r--kernel/irq/handle.c31
-rw-r--r--kernel/irq/internals.h32
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/ipi.c56
-rw-r--r--kernel/irq/irq_sim.c37
-rw-r--r--kernel/irq/irqdesc.c343
-rw-r--r--kernel/irq/irqdomain.c802
-rw-r--r--kernel/irq/manage.c368
-rw-r--r--kernel/irq/matrix.c29
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/msi.c1473
-rw-r--r--kernel/irq/pm.c46
-rw-r--r--kernel/irq/proc.c21
-rw-r--r--kernel/irq/resend.c71
-rw-r--r--kernel/irq/settings.h19
-rw-r--r--kernel/irq/spurious.c12
-rw-r--r--kernel/irq/timings.c17
-rw-r--r--kernel/irq_work.c181
-rw-r--r--kernel/jump_label.c152
-rw-r--r--kernel/kallsyms.c524
-rw-r--r--kernel/kallsyms_internal.h31
-rw-r--r--kernel/kallsyms_selftest.c469
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcmp.c61
-rw-r--r--kernel/kcov.c169
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kcsan/Makefile13
-rw-r--r--kernel/kcsan/atomic.h20
-rw-r--r--kernel/kcsan/core.c815
-rw-r--r--kernel/kcsan/debugfs.c142
-rw-r--r--kernel/kcsan/encoding.h27
-rw-r--r--kernel/kcsan/kcsan.h60
-rw-r--r--kernel/kcsan/kcsan_test.c1607
-rw-r--r--kernel/kcsan/permissive.h94
-rw-r--r--kernel/kcsan/report.c291
-rw-r--r--kernel/kcsan/selftest.c261
-rw-r--r--kernel/kcsan/test.c131
-rw-r--r--kernel/kexec.c111
-rw-r--r--kernel/kexec_core.c378
-rw-r--r--kernel/kexec_file.c405
-rw-r--r--kernel/kexec_internal.h17
-rw-r--r--kernel/kheaders.c10
-rw-r--r--kernel/kprobes.c1239
-rw-r--r--kernel/ksyms_common.c43
-rw-r--r--kernel/ksysfs.c72
-rw-r--r--kernel/kthread.c452
-rw-r--r--kernel/latencytop.c45
-rw-r--r--kernel/livepatch/Kconfig2
-rw-r--r--kernel/livepatch/core.c163
-rw-r--r--kernel/livepatch/patch.c42
-rw-r--r--kernel/livepatch/shadow.c6
-rw-r--r--kernel/livepatch/state.c2
-rw-r--r--kernel/livepatch/transition.c286
-rw-r--r--kernel/locking/Makefile8
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lock_events.c10
-rw-r--r--kernel/locking/lock_events.h4
-rw-r--r--kernel/locking/lock_events_list.h6
-rw-r--r--kernel/locking/lockdep.c1805
-rw-r--r--kernel/locking/lockdep_internals.h23
-rw-r--r--kernel/locking/lockdep_proc.c83
-rw-r--r--kernel/locking/locktorture.c734
-rw-r--r--kernel/locking/mcs_spinlock.h2
-rw-r--r--kernel/locking/mutex-debug.c9
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c688
-rw-r--r--kernel/locking/mutex.h50
-rw-r--r--kernel/locking/osq_lock.c47
-rw-r--r--kernel/locking/percpu-rwsem.c20
-rw-r--r--kernel/locking/qrwlock.c35
-rw-r--r--kernel/locking/qspinlock.c18
-rw-r--r--kernel/locking/qspinlock_paravirt.h24
-rw-r--r--kernel/locking/rtmutex-debug.c182
-rw-r--r--kernel/locking/rtmutex-debug.h37
-rw-r--r--kernel/locking/rtmutex.c1607
-rw-r--r--kernel/locking/rtmutex.h35
-rw-r--r--kernel/locking/rtmutex_api.c612
-rw-r--r--kernel/locking/rtmutex_common.h232
-rw-r--r--kernel/locking/rwbase_rt.c297
-rw-r--r--kernel/locking/rwsem.c932
-rw-r--r--kernel/locking/rwsem.h0
-rw-r--r--kernel/locking/semaphore.c33
-rw-r--r--kernel/locking/spinlock.c80
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/locking/spinlock_rt.c286
-rw-r--r--kernel/locking/test-ww_mutex.c139
-rw-r--r--kernel/locking/ww_mutex.h569
-rw-r--r--kernel/locking/ww_rt_mutex.c101
-rw-r--r--kernel/module-internal.h31
-rw-r--r--kernel/module.c4548
-rw-r--r--kernel/module/Kconfig397
-rw-r--r--kernel/module/Makefile25
-rw-r--r--kernel/module/debug_kmemleak.c30
-rw-r--r--kernel/module/decompress.c368
-rw-r--r--kernel/module/dups.c248
-rw-r--r--kernel/module/internal.h406
-rw-r--r--kernel/module/kallsyms.c521
-rw-r--r--kernel/module/kdb.c63
-rw-r--r--kernel/module/kmod.c (renamed from kernel/kmod.c)56
-rw-r--r--kernel/module/livepatch.c74
-rw-r--r--kernel/module/main.c3369
-rw-r--r--kernel/module/procfs.c152
-rw-r--r--kernel/module/signing.c125
-rw-r--r--kernel/module/stats.c432
-rw-r--r--kernel/module/strict_rwx.c80
-rw-r--r--kernel/module/sysfs.c436
-rw-r--r--kernel/module/tracking.c129
-rw-r--r--kernel/module/tree_lookup.c112
-rw-r--r--kernel/module/version.c101
-rw-r--r--kernel/module_signature.c2
-rw-r--r--kernel/module_signing.c45
-rw-r--r--kernel/notifier.c246
-rw-r--r--kernel/nsproxy.c78
-rw-r--r--kernel/numa.c26
-rw-r--r--kernel/padata.c256
-rw-r--r--kernel/panic.c227
-rw-r--r--kernel/params.c135
-rw-r--r--kernel/pid.c118
-rw-r--r--kernel/pid_namespace.c58
-rw-r--r--kernel/pid_sysctl.h54
-rw-r--r--kernel/power/Kconfig45
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/energy_model.c410
-rw-r--r--kernel/power/hibernate.c347
-rw-r--r--kernel/power/main.c179
-rw-r--r--kernel/power/power.h38
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c48
-rw-r--r--kernel/power/qos.c22
-rw-r--r--kernel/power/snapshot.c384
-rw-r--r--kernel/power/suspend.c54
-rw-r--r--kernel/power/suspend_test.c10
-rw-r--r--kernel/power/swap.c208
-rw-r--r--kernel/power/user.c79
-rw-r--r--kernel/power/wakelock.c11
-rw-r--r--kernel/printk/Makefile7
-rw-r--r--kernel/printk/index.c194
-rw-r--r--kernel/printk/internal.h120
-rw-r--r--kernel/printk/nbcon.c1029
-rw-r--r--kernel/printk/printk.c3360
-rw-r--r--kernel/printk/printk_ringbuffer.c2124
-rw-r--r--kernel/printk/printk_ringbuffer.h384
-rw-r--r--kernel/printk/printk_safe.c377
-rw-r--r--kernel/printk/sysctl.c85
-rw-r--r--kernel/profile.c131
-rw-r--r--kernel/ptrace.c363
-rw-r--r--kernel/range.c3
-rw-r--r--kernel/rcu/Kconfig196
-rw-r--r--kernel/rcu/Kconfig.debug91
-rw-r--r--kernel/rcu/Makefile3
-rw-r--r--kernel/rcu/rcu.h186
-rw-r--r--kernel/rcu/rcu_segcblist.c239
-rw-r--r--kernel/rcu/rcu_segcblist.h61
-rw-r--r--kernel/rcu/rcuperf.c856
-rw-r--r--kernel/rcu/rcuscale.c1058
-rw-r--r--kernel/rcu/rcutorture.c2039
-rw-r--r--kernel/rcu/refscale.c1169
-rw-r--r--kernel/rcu/srcutiny.c94
-rw-r--r--kernel/rcu/srcutree.c1405
-rw-r--r--kernel/rcu/sync.c8
-rw-r--r--kernel/rcu/tasks.h1582
-rw-r--r--kernel/rcu/tiny.c96
-rw-r--r--kernel/rcu/tree.c3295
-rw-r--r--kernel/rcu/tree.h143
-rw-r--r--kernel/rcu/tree_exp.h412
-rw-r--r--kernel/rcu/tree_nocb.h1805
-rw-r--r--kernel/rcu/tree_plugin.h1634
-rw-r--r--kernel/rcu/tree_stall.h491
-rw-r--r--kernel/rcu/update.c151
-rw-r--r--kernel/reboot.c786
-rw-r--r--kernel/regset.c76
-rw-r--r--kernel/relay.c296
-rw-r--r--kernel/resource.c791
-rw-r--r--kernel/resource_kunit.c152
-rw-r--r--kernel/rseq.c148
-rw-r--r--kernel/scftorture.c666
-rw-r--r--kernel/sched/Makefile38
-rw-r--r--kernel/sched/autogroup.c31
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c54
-rw-r--r--kernel/sched/build_utility.c109
-rw-r--r--kernel/sched/clock.c77
-rw-r--r--kernel/sched/completion.c40
-rw-r--r--kernel/sched/core.c7047
-rw-r--r--kernel/sched/core_sched.c300
-rw-r--r--kernel/sched/cpuacct.c127
-rw-r--r--kernel/sched/cpudeadline.c31
-rw-r--r--kernel/sched/cpufreq.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c462
-rw-r--r--kernel/sched/cpupri.c60
-rw-r--r--kernel/sched/cpupri.h8
-rw-r--r--kernel/sched/cputime.c120
-rw-r--r--kernel/sched/deadline.c1499
-rw-r--r--kernel/sched/debug.c661
-rw-r--r--kernel/sched/fair.c6053
-rw-r--r--kernel/sched/features.h39
-rw-r--r--kernel/sched/idle.c123
-rw-r--r--kernel/sched/isolation.c170
-rw-r--r--kernel/sched/loadavg.c7
-rw-r--r--kernel/sched/membarrier.c381
-rw-r--r--kernel/sched/pelt.c14
-rw-r--r--kernel/sched/pelt.h66
-rw-r--r--kernel/sched/psi.c1202
-rw-r--r--kernel/sched/rt.c577
-rw-r--r--kernel/sched/sched.h1712
-rw-r--r--kernel/sched/smp.h8
-rw-r--r--kernel/sched/stats.c107
-rw-r--r--kernel/sched/stats.h181
-rw-r--r--kernel/sched/stop_task.c48
-rw-r--r--kernel/sched/swait.c9
-rw-r--r--kernel/sched/topology.c1087
-rw-r--r--kernel/sched/wait.c120
-rw-r--r--kernel/sched/wait_bit.c4
-rw-r--r--kernel/scs.c90
-rw-r--r--kernel/seccomp.c1019
-rw-r--r--kernel/signal.c887
-rw-r--r--kernel/smp.c569
-rw-r--r--kernel/smpboot.c170
-rw-r--r--kernel/softirq.c529
-rw-r--r--kernel/stackleak.c155
-rw-r--r--kernel/stacktrace.c46
-rw-r--r--kernel/static_call.c8
-rw-r--r--kernel/static_call_inline.c556
-rw-r--r--kernel/stop_machine.c53
-rw-r--r--kernel/sys.c634
-rw-r--r--kernel/sys_ni.c153
-rw-r--r--kernel/sysctl-test.c67
-rw-r--r--kernel/sysctl.c1615
-rw-r--r--kernel/sysctl_binary.c171
-rw-r--r--kernel/task_work.c105
-rw-r--r--kernel/taskstats.c77
-rw-r--r--kernel/test_kprobes.c313
-rw-r--r--kernel/time/Kconfig97
-rw-r--r--kernel/time/Makefile3
-rw-r--r--kernel/time/alarmtimer.c73
-rw-r--r--kernel/time/clockevents.c34
-rw-r--r--kernel/time/clocksource-wdtest.c201
-rw-r--r--kernel/time/clocksource.c360
-rw-r--r--kernel/time/hrtimer.c506
-rw-r--r--kernel/time/itimer.c4
-rw-r--r--kernel/time/jiffies.c41
-rw-r--r--kernel/time/namespace.c63
-rw-r--r--kernel/time/ntp.c229
-rw-r--r--kernel/time/ntp_internal.h7
-rw-r--r--kernel/time/posix-clock.c36
-rw-r--r--kernel/time/posix-cpu-timers.c451
-rw-r--r--kernel/time/posix-stubs.c50
-rw-r--r--kernel/time/posix-timers.c558
-rw-r--r--kernel/time/sched_clock.c76
-rw-r--r--kernel/time/test_udelay.c7
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c31
-rw-r--r--kernel/time/tick-broadcast.c312
-rw-r--r--kernel/time/tick-common.c39
-rw-r--r--kernel/time/tick-internal.h41
-rw-r--r--kernel/time/tick-legacy.c37
-rw-r--r--kernel/time/tick-oneshot.c6
-rw-r--r--kernel/time/tick-sched.c727
-rw-r--r--kernel/time/tick-sched.h65
-rw-r--r--kernel/time/time.c183
-rw-r--r--kernel/time/time_test.c99
-rw-r--r--kernel/time/timeconv.c134
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c394
-rw-r--r--kernel/time/timekeeping.h3
-rw-r--r--kernel/time/timekeeping_internal.h11
-rw-r--r--kernel/time/timer.c963
-rw-r--r--kernel/time/timer_list.c76
-rw-r--r--kernel/time/vsyscall.c43
-rw-r--r--kernel/torture.c259
-rw-r--r--kernel/trace/Kconfig346
-rw-r--r--kernel/trace/Makefile21
-rw-r--r--kernel/trace/blktrace.c527
-rw-r--r--kernel/trace/bpf_trace.c2288
-rw-r--r--kernel/trace/bpf_trace.h34
-rw-r--r--kernel/trace/error_report-traces.c11
-rw-r--r--kernel/trace/fgraph.c83
-rw-r--r--kernel/trace/fprobe.c386
-rw-r--r--kernel/trace/ftrace.c2072
-rw-r--r--kernel/trace/ftrace_internal.h5
-rw-r--r--kernel/trace/kprobe_event_gen_test.c119
-rw-r--r--kernel/trace/pid_list.c495
-rw-r--r--kernel/trace/pid_list.h88
-rw-r--r--kernel/trace/preemptirq_delay_test.c14
-rw-r--r--kernel/trace/rethook.c335
-rw-r--r--kernel/trace/ring_buffer.c2164
-rw-r--r--kernel/trace/ring_buffer_benchmark.c60
-rw-r--r--kernel/trace/rv/Kconfig78
-rw-r--r--kernel/trace/rv/Makefile8
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c88
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h46
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c87
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h46
-rw-r--r--kernel/trace/rv/reactor_panic.c42
-rw-r--r--kernel/trace/rv/reactor_printk.c41
-rw-r--r--kernel/trace/rv/rv.c797
-rw-r--r--kernel/trace/rv/rv.h68
-rw-r--r--kernel/trace/rv/rv_reactors.c510
-rw-r--r--kernel/trace/synth_event_gen_test.c51
-rw-r--r--kernel/trace/trace.c3246
-rw-r--r--kernel/trace/trace.h632
-rw-r--r--kernel/trace/trace_benchmark.c8
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_boot.c371
-rw-r--r--kernel/trace/trace_branch.c6
-rw-r--r--kernel/trace/trace_btf.c122
-rw-r--r--kernel/trace/trace_btf.h11
-rw-r--r--kernel/trace/trace_clock.c44
-rw-r--r--kernel/trace/trace_dynevent.c100
-rw-r--r--kernel/trace/trace_dynevent.h14
-rw-r--r--kernel/trace/trace_entries.h95
-rw-r--r--kernel/trace/trace_eprobe.c984
-rw-r--r--kernel/trace/trace_event_perf.c54
-rw-r--r--kernel/trace/trace_events.c1170
-rw-r--r--kernel/trace/trace_events_filter.c814
-rw-r--r--kernel/trace/trace_events_hist.c1752
-rw-r--r--kernel/trace/trace_events_inject.c24
-rw-r--r--kernel/trace/trace_events_synth.c841
-rw-r--r--kernel/trace/trace_events_trigger.c821
-rw-r--r--kernel/trace/trace_events_user.c2784
-rw-r--r--kernel/trace/trace_export.c16
-rw-r--r--kernel/trace/trace_fprobe.c1231
-rw-r--r--kernel/trace/trace_functions.c294
-rw-r--r--kernel/trace/trace_functions_graph.c139
-rw-r--r--kernel/trace/trace_hwlat.c568
-rw-r--r--kernel/trace/trace_irqsoff.c95
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c505
-rw-r--r--kernel/trace/trace_kprobe_selftest.c3
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_osnoise.c3133
-rw-r--r--kernel/trace/trace_output.c534
-rw-r--r--kernel/trace/trace_output.h3
-rw-r--r--kernel/trace/trace_preemptirq.c63
-rw-r--r--kernel/trace/trace_printk.c23
-rw-r--r--kernel/trace/trace_probe.c1146
-rw-r--r--kernel/trace/trace_probe.h127
-rw-r--r--kernel/trace/trace_probe_kernel.h119
-rw-r--r--kernel/trace/trace_probe_tmpl.h111
-rw-r--r--kernel/trace/trace_recursion_record.c233
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c102
-rw-r--r--kernel/trace/trace_selftest.c147
-rw-r--r--kernel/trace/trace_seq.c45
-rw-r--r--kernel/trace/trace_stack.c15
-rw-r--r--kernel/trace/trace_stat.c14
-rw-r--r--kernel/trace/trace_synth.h9
-rw-r--r--kernel/trace/trace_syscalls.c59
-rw-r--r--kernel/trace/trace_uprobe.c171
-rw-r--r--kernel/trace/tracing_map.c66
-rw-r--r--kernel/trace/tracing_map.h6
-rw-r--r--kernel/tracepoint.c311
-rw-r--r--kernel/tsacct.c19
-rw-r--r--kernel/ucount.c205
-rw-r--r--kernel/umh.c295
-rw-r--r--kernel/up.c40
-rw-r--r--kernel/user.c44
-rw-r--r--kernel/user_namespace.c130
-rw-r--r--kernel/usermode_driver.c191
-rw-r--r--kernel/utsname.c7
-rw-r--r--kernel/utsname_sysctl.c21
-rw-r--r--kernel/vhost_task.c149
-rw-r--r--kernel/watch_queue.c143
-rw-r--r--kernel/watchdog.c621
-rw-r--r--kernel/watchdog_buddy.c113
-rw-r--r--kernel/watchdog_perf.c (renamed from kernel/watchdog_hld.c)105
-rw-r--r--kernel/workqueue.c2952
-rw-r--r--kernel/workqueue_internal.h25
553 files changed, 161507 insertions, 58762 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 78701ea37c97..c6b299a6b786 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,4 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-kheaders.md5
-timeconst.h
-hz.bc
+/config_data
+/kheaders.md5
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..946dffa048b7
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_CORE
+ bool
+
+config KEXEC_CORE
+ select CRASH_CORE
+ bool
+
+config KEXEC_ELF
+ bool
+
+config HAVE_IMA_KEXEC
+ bool
+
+config KEXEC
+ bool "Enable kexec system call"
+ depends on ARCH_SUPPORTS_KEXEC
+ select KEXEC_CORE
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "Enable kexec file based system call"
+ depends on ARCH_SUPPORTS_KEXEC_FILE
+ select CRYPTO
+ select CRYPTO_SHA256
+ select KEXEC_CORE
+ help
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by kexec system call.
+
+config KEXEC_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG
+ depends on KEXEC_FILE
+ help
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image. The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+ bool "Require a valid signature in kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ depends on KEXEC_SIG
+ help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+ bool "Enable Image signature verification support (ARM)"
+ default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+ depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on EFI && SIGNED_PE_FILE_VERIFICATION
+ help
+ Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on ARCH_SUPPORTS_KEXEC_JUMP
+ depends on KEXEC && HIBERNATION
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ depends on ARCH_SUPPORTS_CRASH_DUMP
+ select CRASH_CORE
+ select KEXEC_CORE
+ help
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/admin-guide/kdump/kdump.rst
+
+ For s390, this option also enables zfcpdump.
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+config CRASH_HOTPLUG
+ bool "Update the crash elfcorehdr on system configuration changes"
+ default y
+ depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+ depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+ help
+ Enable direct update to the crash elfcorehdr (which contains
+ the list of CPUs and memory regions to be dumped upon a crash)
+ in response to hot plug/unplug or online/offline of CPUs or
+ memory. This is a much more advanced approach than userspace
+ attempting that.
+
+ If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+ int "Specify the maximum number of memory regions for the elfcorehdr"
+ default 8192
+ depends on CRASH_HOTPLUG
+ help
+ For the kexec_file_load() syscall path, specify the maximum number of
+ memory regions that the elfcorehdr buffer/segment can accommodate.
+ These regions are obtained via walk_system_ram_res(); eg. the
+ 'System RAM' entries in /proc/iomem.
+ This value is combined with NR_CPUS_DEFAULT and multiplied by
+ sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+ segment size.
+ The value 8192, for example, covers a (sparsely populated) 1TiB system
+ consisting of 128MiB memblocks, while resulting in an elfcorehdr
+ memory buffer/segment size under 1MiB. This represents a sane choice
+ to accommodate both baremetal and virtual machine configurations.
+
+ For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+ the computation behind the value provided through the
+ /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -38,8 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -80,3 +92,45 @@ config PREEMPT_COUNT
config PREEMPTION
bool
select PREEMPT_COUNT
+
+config PREEMPT_DYNAMIC
+ bool "Preemption behaviour defined on boot"
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
+ select PREEMPT_BUILD
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
+ help
+ This option allows to define the preemption model on the kernel
+ command line parameter and thus override the default preemption
+ model defined during compile time.
+
+ The feature is primarily interesting for Linux distributions which
+ provide a pre-built kernel binary to reduce the number of kernel
+ flavors they offer while still offering different usecases.
+
+ The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled
+ but if runtime patching is not available for the specific architecture
+ then the potential overhead should be considered.
+
+ Interesting if you want the same pre-built kernel should be used for
+ both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ depends on SCHED_SMT
+ help
+ This option permits Core Scheduling, a means of coordinated task
+ selection across SMT siblings. When enabled -- see
+ prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings
+ will execute a task from the same 'core group', forcing idle when no
+ matching task is found.
+
+ Use of this feature includes:
+ - mitigation of some (not all) SMT side channels;
+ - limiting SMT interference to improve determinism and/or performance.
+
+ SCHED_CORE is default disabled. When it is enabled and unused,
+ which is the likely usage by Linux distributions, there should
+ be no measurable impact on performance.
+
+
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..ce105a5558fc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,15 +5,16 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
-obj-$(CONFIG_MODULES) += kmod.o
+obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
@@ -28,17 +29,17 @@ KCOV_INSTRUMENT_softirq.o := n
KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
+# If sanitizers detect any issues in kcov, it may lead to recursion
+# via printk, etc.
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-
-# cond_syscall is currently not LTO compatible
-CFLAGS_sys_ni.o = $(DISABLE_LTO)
+UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
obj-y += sched/
obj-y += locking/
@@ -48,23 +49,24 @@ obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
+obj-y += entry/
+obj-$(CONFIG_MODULES) += module/
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
@@ -80,7 +82,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
@@ -90,7 +91,8 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -98,17 +100,21 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_ELFCORE) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -123,19 +129,28 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
+obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
+CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCSAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
+obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
+
$(obj)/configs.o: $(obj)/config_data.gz
-targets += config_data.gz
-$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
+targets += config_data config_data.gz
+$(obj)/config_data.gz: $(obj)/config_data FORCE
$(call if_changed,gzip)
+filechk_cat = cat $<
+
+$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
+ $(call filechk,cat)
+
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
diff --git a/kernel/acct.c b/kernel/acct.c
index b0c5b3a9f5af..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -25,7 +25,7 @@
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
- * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
@@ -71,11 +70,31 @@
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_acct_table[] = {
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_acct_table);
+ return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* External references and all of the globals.
*/
@@ -227,7 +246,7 @@ static int acct_on(struct filename *pathname)
filp_close(file, NULL);
return PTR_ERR(internal);
}
- err = __mnt_want_write(internal);
+ err = mnt_get_write_access(internal);
if (err) {
mntput(internal);
kfree(acct);
@@ -252,7 +271,7 @@ static int acct_on(struct filename *pathname)
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
mntput(mnt);
return 0;
}
@@ -263,12 +282,12 @@ static DEFINE_MUTEX(acct_on_mutex);
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
- * Returns 0 for success or negative errno values for failure.
- *
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
+ *
+ * Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
@@ -301,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -312,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -331,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
@@ -381,9 +402,7 @@ static comp2_t encode_comp2_t(u64 value)
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
-#endif
-
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -426,7 +445,7 @@ static void fill_ac(acct_t *ac)
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+ strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -451,7 +470,7 @@ static void fill_ac(acct_t *ac)
do_div(elapsed, AHZ);
btime = ktime_get_real_seconds() - elapsed;
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
-#if ACCT_VERSION==2
+#if ACCT_VERSION == 2
ac->ac_ahz = AHZ;
#endif
@@ -480,7 +499,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
@@ -500,8 +519,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
-#endif
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
@@ -539,15 +557,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
@@ -586,9 +603,7 @@ static void slow_acct_process(struct pid_namespace *ns)
}
/**
- * acct_process
- *
- * handles process accounting for an exiting task
+ * acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..97f224a5257b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
#include <linux/async.h>
#include <linux/atomic.h>
-#include <linux/ktime.h>
#include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
@@ -78,6 +79,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
+static long long microseconds_since(ktime_t start)
+{
+ ktime_t now = ktime_get();
+ return ktime_to_ns(ktime_sub(now, start)) >> 10;
+}
+
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
@@ -111,24 +118,18 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("calling %lli_%pS @ %i\n",
- (long long)entry->cookie,
- entry->func, task_pid_nr(current));
- calltime = ktime_get();
- }
+ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- rettime = ktime_get();
- delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
- (long long)entry->cookie,
- entry->func,
- (long long)ktime_to_ns(delta) >> 10);
- }
+
+ pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+ (long long)entry->cookie, entry->func,
+ microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
@@ -145,6 +146,39 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
@@ -186,32 +220,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
-
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
-
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
-
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
@@ -235,33 +245,44 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
- * async_synchronize_full - synchronize all asynchronous function calls
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
*
- * This function waits until all asynchronous function calls have been done.
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
*/
-void async_synchronize_full(void)
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
{
- async_synchronize_full_domain(NULL);
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
}
-EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
+ * async_synchronize_full - synchronize all asynchronous function calls
*
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ * This function waits until all asynchronous function calls have been done.
*/
-void async_unregister_domain(struct async_domain *domain)
+void async_synchronize_full(void)
{
- spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->pending));
- domain->registered = 0;
- spin_unlock_irq(&async_lock);
+ async_synchronize_full_domain(NULL);
}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
+EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
@@ -287,23 +308,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime;
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("async_waiting @ %i\n", task_pid_nr(current));
- starttime = ktime_get();
- }
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- endtime = ktime_get();
- delta = ktime_sub(endtime, starttime);
-
- pr_debug("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current),
- (long long)ktime_to_ns(delta) >> 10);
- }
+ pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+ microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
diff --git a/kernel/audit.c b/kernel/audit.c
index b2301bdc9773..9c8e5f732c4c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -53,9 +53,7 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#ifdef CONFIG_SECURITY
#include <linux/security.h>
-#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -67,7 +65,7 @@
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
-static int audit_initialized;
+static int audit_initialized = AUDIT_UNINITIALIZED;
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
@@ -123,9 +121,9 @@ static u32 audit_backlog_limit = 64;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
-kuid_t audit_sig_uid = INVALID_UID;
-pid_t audit_sig_pid = -1;
-u32 audit_sig_sid = 0;
+static kuid_t audit_sig_uid = INVALID_UID;
+static pid_t audit_sig_pid = -1;
+static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -136,6 +134,11 @@ u32 audit_sig_sid = 0;
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
+
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -316,18 +319,17 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
- if (!audit_rate_limit) return 1;
+ if (!audit_rate_limit)
+ return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -361,7 +363,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
@@ -485,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu)
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
-static int auditd_set(struct pid *pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+ struct sk_buff *skb, bool *ack)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
+ struct nlmsghdr *nlh;
if (!pid || !net)
return -EINVAL;
@@ -505,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
ac_new->portid = portid;
ac_new->net = get_net(net);
+ /* send the ack now to avoid a race with the queue backlog */
+ if (*ack) {
+ nlh = nlmsg_hdr(skb);
+ netlink_ack(skb, nlh, 0, NULL);
+ *ack = false;
+ }
+
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
@@ -518,7 +531,7 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
}
/**
- * kauditd_print_skb - Print the audit record to the ring buffer
+ * kauditd_printk_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
@@ -536,20 +549,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -559,19 +574,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -580,24 +607,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -635,7 +670,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -709,16 +744,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
- static unsigned int failed = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -726,36 +763,34 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ (*err_hook)(skb, rc);
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -929,8 +964,7 @@ static void audit_free_reply(struct audit_reply *reply)
if (!reply)
return;
- if (reply->skb)
- kfree_skb(reply->skb);
+ kfree_skb(reply->skb);
if (reply->net)
put_net(reply->net);
kfree(reply);
@@ -1074,7 +1108,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
@@ -1177,7 +1211,8 @@ static int audit_replace(struct pid *pid)
return auditd_send_unicast_skb(skb);
}
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ bool *ack)
{
u32 seq;
void *data;
@@ -1201,17 +1236,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
- s.enabled = audit_enabled;
- s.failure = audit_failure;
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
- s.pid = auditd_pid_vnr();
- s.rate_limit = audit_rate_limit;
- s.backlog_limit = audit_backlog_limit;
- s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_queue);
- s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time;
+ s.pid = auditd_pid_vnr();
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -1269,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
- sock_net(NETLINK_CB(skb).sk));
+ sock_net(NETLINK_CB(skb).sk),
+ skb, ack);
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
@@ -1315,6 +1352,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
+ if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+ u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+ audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+ return actual;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1357,7 +1400,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
str);
} else {
audit_log_format(ab, " data=");
- if (data_len > 0 && str[data_len - 1] == '\0')
+ if (str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
@@ -1435,7 +1478,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1448,7 +1491,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1508,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
-static void audit_receive(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
+ bool ack;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
@@ -1523,14 +1567,31 @@ static void audit_receive(struct sk_buff *skb)
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
- err = audit_receive_msg(skb, nlh);
- /* if err or if this message says it wants a response */
- if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+ ack = nlh->nlmsg_flags & NLM_F_ACK;
+ err = audit_receive_msg(skb, nlh, &ack);
+
+ /* send an ack if the user asked for one and audit_receive_msg
+ * didn't already do it, or if there was an error. */
+ if (ack || err)
netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Log information about who is connecting to the audit multicast socket */
@@ -1598,7 +1659,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1768,7 +1830,7 @@ unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
- return atomic_add_return(1, &serial);
+ return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1800,7 +1862,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
{
struct audit_buffer *ab;
struct timespec64 t;
- unsigned int uninitialized_var(serial);
+ unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1814,7 +1876,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -1826,12 +1890,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+ long rtime = stime;
+
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
- stime = schedule_timeout(stime);
+ stime = schedule_timeout(rtime);
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -1851,6 +1918,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
}
audit_get_stamp(ab->ctx, &t, &serial);
+ /* cancel dummy context to enable supporting records */
+ if (ctx)
+ ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
(unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
@@ -2079,13 +2149,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_string(ab, "<no_memory>");
+ audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_string(ab, "<too_long>");
+ audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
@@ -2115,7 +2185,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2268,7 +2338,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
+ loginuid = from_kuid(&init_user_ns, kloginuid);
tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
@@ -2336,7 +2406,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
@@ -2348,7 +2418,7 @@ int audit_signal_info(int sig, struct task_struct *t)
*
* We can not do a netlink send inside an irq context because it blocks (last
* arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
- * queue and a tasklet is scheduled to remove them from the queue outside the
+ * queue and a kthread is scheduled to remove them from the queue outside the
* irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
diff --git a/kernel/audit.h b/kernel/audit.h
index ddc22878433d..a60d2840559e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,16 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* audit -- definition of audit_context structure and supporting types
+/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
+#ifndef _KERNEL_AUDIT_H_
+#define _KERNEL_AUDIT_H_
+
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
+#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -21,16 +25,16 @@
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
- AUDIT_DISABLED, /* Do not create per-task audit_context.
+ AUDIT_STATE_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ AUDIT_STATE_BUILD, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
- AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ AUDIT_STATE_RECORD /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
@@ -97,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
+ enum {
+ AUDIT_CTX_UNUSED, /* audit_context is currently unused */
+ AUDIT_CTX_SYSCALL, /* in use by syscall */
+ AUDIT_CTX_URING, /* in use by io_uring */
+ } context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
+ int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@@ -124,7 +133,7 @@ struct audit_context {
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
- pid_t pid, ppid;
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -185,12 +194,17 @@ struct audit_context {
int fd;
int flags;
} mmap;
+ struct open_how openat2;
struct {
int argc;
} execve;
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
@@ -231,8 +245,6 @@ struct audit_netlink_list {
int audit_send_list_thread(void *_dest);
-extern int selinux_audit_rule_update(void);
-
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
@@ -247,8 +259,8 @@ extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
/* audit watch/mark/tree functions */
-#ifdef CONFIG_AUDITSYSCALL
extern unsigned int audit_serial(void);
+#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial);
@@ -292,8 +304,8 @@ extern void audit_filter_inodes(struct task_struct *tsk,
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, t, s) 0
-#define audit_put_watch(w) {}
-#define audit_get_watch(w) {}
+#define audit_put_watch(w) do { } while (0)
+#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
@@ -302,8 +314,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
-#define audit_remove_mark(m)
-#define audit_remove_mark_rule(k)
+#define audit_remove_mark(m) do { } while (0)
+#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
@@ -311,8 +323,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
-#define audit_trim_trees() (void)0
-#define audit_put_tree(tree) (void)0
+#define audit_trim_trees() do { } while (0)
+#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
@@ -322,16 +334,14 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) do { } while (0)
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
-extern pid_t audit_sig_pid;
-extern kuid_t audit_sig_uid;
-extern u32 audit_sig_sid;
-
extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
+
+#endif
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 3596448bfdab..c565fbf66ac8 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group;
/* fsnotify events we care about. */
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF)
static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
{
@@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
- return (void *)dentry; /* returning an error */
+ return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
@@ -100,8 +100,9 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
if (ret < 0) {
+ audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
@@ -152,41 +153,37 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
}
/* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct audit_fsnotify_mark *audit_mark;
- const struct inode *inode = fsnotify_data_inode(data, data_type);
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- BUG_ON(group != audit_fsnotify_group);
-
- if (WARN_ON(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
return 0;
audit_update_mark(audit_mark, inode);
- } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
audit_autoremove_mark_rule(audit_mark);
+ }
return 0;
}
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
- .handle_event = audit_mark_handle_event,
+ .handle_inode_event = audit_mark_handle_event,
.free_mark = audit_fsnotify_free_mark,
};
static int __init audit_fsnotify_init(void)
{
- audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+ FSNOTIFY_GROUP_DUPS);
if (IS_ERR(audit_fsnotify_group)) {
audit_fsnotify_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..1b07e6f12a07 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -30,11 +30,11 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
- struct node {
+ struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
- } owners[];
+ } owners[] __counted_by(count);
};
struct audit_tree_mark {
@@ -87,14 +87,14 @@ static struct task_struct *prune_thread;
* that makes a difference. Some.
*/
-static struct fsnotify_group *audit_tree_group;
-static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void)
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
- size_t size;
int i;
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
+ chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
@@ -271,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static struct audit_chunk *find_chunk(struct node *p)
+static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@@ -324,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}
-static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
@@ -353,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
struct audit_chunk *new;
int size;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
/*
* mark_mutex stabilizes chunk attached to the mark so we can check
* whether it didn't change while we've dropped hash_lock.
@@ -370,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
fsnotify_free_mark(mark);
return;
@@ -387,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
*/
replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
return;
out_mutex:
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
}
/* Call with group->mark_mutex held, releases it */
@@ -402,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
return -ENOMEM;
}
mark = alloc_mark();
if (!mark) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
kfree(chunk);
return -ENOMEM;
}
if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return -ENOSPC;
@@ -424,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
if (tree->goner) {
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
kfree(chunk);
@@ -446,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
*/
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
/*
* Drop our initial reference. When mark we point to is getting freed,
* we get notification through ->freeing_mark callback and cleanup
@@ -461,10 +459,10 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
- struct node *p;
+ struct audit_node *p;
int n;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
if (!mark)
return create_chunk(inode, tree);
@@ -480,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return 0;
}
@@ -489,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return -ENOMEM;
}
@@ -497,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
@@ -517,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
*/
replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
audit_mark_put_chunk(old);
@@ -572,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
- struct node *p;
+ struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
- p = list_first_entry(&victim->chunks, struct node, list);
+ p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@@ -595,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
- put_tree(victim);
}
/*
@@ -604,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
+ put_tree(victim);
}
/* trim the uncommitted chunks from tree */
@@ -618,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
- struct node *node = list_entry(p, struct node, list);
+ struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@@ -686,13 +684,12 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
- struct node *node;
+ struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
@@ -729,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
- rule->listnr != AUDIT_FILTER_EXIT ||
+ (rule->listnr != AUDIT_FILTER_EXIT &&
+ rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -842,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);
if (!err) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -901,8 +899,7 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
@@ -927,8 +924,7 @@ int audit_tag_tree(char *old, char *new)
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
- list_del(&tree->list);
- list_add(&tree->list, &tree_list);
+ list_move(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
@@ -939,12 +935,11 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
- list_del(&tree->list);
- list_add(&tree->list, &barrier);
+ list_move(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -1037,11 +1032,9 @@ static void evict_chunk(struct audit_chunk *chunk)
audit_schedule_prune();
}
-static int audit_tree_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *file_name, u32 cookie)
{
return 0;
}
@@ -1051,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
{
struct audit_chunk *chunk;
- mutex_lock(&mark->group->mark_mutex);
+ fsnotify_group_lock(mark->group);
spin_lock(&hash_lock);
chunk = mark_chunk(mark);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- mutex_unlock(&mark->group->mark_mutex);
+ fsnotify_group_unlock(mark->group);
if (chunk) {
evict_chunk(chunk);
audit_mark_put_chunk(chunk);
@@ -1070,7 +1063,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
}
static const struct fsnotify_ops audit_tree_ops = {
- .handle_event = audit_tree_handle_event,
+ .handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
.free_mark = audit_tree_destroy_watch,
};
@@ -1081,7 +1074,7 @@ static int __init audit_tree_init(void)
audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e09c551ae52d..7a98cd176a12 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
+ FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
@@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
+ (krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -464,20 +465,16 @@ void audit_remove_watch_rule(struct audit_krule *krule)
}
/* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- const struct inode *inode = fsnotify_data_inode(data, data_type);
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
- BUG_ON(group != audit_watch_group);
- WARN_ON(!inode);
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
+ return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -490,13 +487,13 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .handle_event = audit_watch_handle_event,
+ .handle_inode_event = audit_watch_handle_event,
.free_mark = audit_watch_free_mark,
};
static int __init audit_watch_init(void)
{
- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
@@ -530,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- exe_file = get_task_exe_file(tsk);
+ /* only do exe filtering if we are recording @current events/records */
+ if (tsk != current)
+ return 0;
+
+ if (!current->mm)
+ return 0;
+ exe_file = get_mm_exe_file(current->mm);
if (!exe_file)
return 0;
ino = file_inode(exe_file)->i_ino;
dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
+
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a10e2997aa6c..8317a37dea0b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
-#if AUDIT_NR_FILTERS != 7
+ LIST_HEAD_INIT(audit_filter_list[7]),
+#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
+ LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
- if (krule->listnr != AUDIT_FILTER_EXIT ||
+ if ((krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -218,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry)
entry->rule.mask));
}
- switch(audit_classify_arch(arch->val)) {
+ switch (audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
@@ -240,7 +243,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
- switch(listnr) {
+ switch (listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
@@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@@ -332,11 +336,15 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
+ case AUDIT_PERM:
+ if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
+ return -EINVAL;
+ break;
}
switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
- switch(f->type) {
+ switch (f->type) {
case AUDIT_FSTYPE:
case AUDIT_FILTERKEY:
break;
@@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -643,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
- switch(f->type) {
+ switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -681,12 +689,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fall through - if set */
+ fallthrough; /* if set */
default:
data->values[i] = f->val;
}
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ data->mask[i] = krule->mask[i];
return data;
}
@@ -709,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
a->fields[i].op != b->fields[i].op)
return 1;
- switch(a->fields[i].type) {
+ switch (a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -938,7 +947,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -980,7 +989,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}
entry->rule.prio = ~0ULL;
- if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
+ entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -1020,7 +1030,7 @@ int audit_del_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1074,7 +1084,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
@@ -1083,7 +1093,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1359,7 +1369,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fd840c40abf7..6f0d6fb6523f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@@ -75,6 +62,9 @@
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -101,8 +91,6 @@ struct audit_aux_data {
int type;
};
-#define AUDIT_AUX_IPCPERM 0
-
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
@@ -136,20 +124,40 @@ struct audit_nfcfgop_tab {
};
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
- { AUDIT_XT_OP_REGISTER, "register" },
- { AUDIT_XT_OP_REPLACE, "replace" },
- { AUDIT_XT_OP_UNREGISTER, "unregister" },
+ { AUDIT_XT_OP_REGISTER, "xt_register" },
+ { AUDIT_XT_OP_REPLACE, "xt_replace" },
+ { AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
+ { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
+ { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
+ { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
+ { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
+ { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
+ { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
+ { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
+ { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
+ { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
+ { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
+ { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
+ { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
+ { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
+ { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
+ { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
+ { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" },
+ { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" },
+ { AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
+
if (unlikely(!ctx))
return 0;
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
- case 0: /* native */
+ case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@@ -160,7 +168,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
- case 1: /* 32bit on biarch */
+ case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@@ -171,14 +179,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
- case 2: /* open */
+ case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
- case 3: /* openat */
+ case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
- case 4: /* socketcall */
+ case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
- case 5: /* execve */
+ case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
+ case AUDITSC_OPENAT2:
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -215,7 +225,7 @@ static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
}
@@ -223,6 +233,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
int left = ctx->tree_count;
+
if (likely(left)) {
p->c[--left] = chunk;
ctx->tree_count = left;
@@ -243,6 +254,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
static int grow_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p = ctx->trees;
+
ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
if (!ctx->trees) {
ctx->trees = p;
@@ -261,6 +273,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
{
struct audit_tree_refs *q;
int n;
+
if (!p) {
/* we started with empty chain */
p = ctx->first_trees;
@@ -287,6 +300,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
static void free_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p, *q;
+
for (p = ctx->first_trees; p; p = q) {
q = p->next;
kfree(p);
@@ -297,6 +311,7 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
struct audit_tree_refs *p;
int n;
+
if (!tree)
return 0;
/* full ones */
@@ -321,13 +336,13 @@ static int audit_compare_uid(kuid_t uid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_uid_comparator(uid, f->op, n->uid);
@@ -345,13 +360,13 @@ static int audit_compare_gid(kgid_t gid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_gid_comparator(gid, f->op, name->gid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_gid_comparator(gid, f->op, n->gid);
@@ -458,6 +473,9 @@ static int audit_filter_rules(struct task_struct *tsk,
u32 sid;
unsigned int sessionid;
+ if (ctx && rule->prio <= ctx->prio)
+ return 0;
+
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
@@ -534,11 +552,11 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXIT:
- if (ctx && ctx->return_valid)
+ if (ctx && ctx->return_valid != AUDITSC_INVALID)
result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid) {
+ if (ctx && ctx->return_valid != AUDITSC_INVALID) {
if (f->val)
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
@@ -635,7 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
@@ -651,7 +669,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -725,8 +752,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
- if (rule->prio <= ctx->prio)
- return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -735,10 +760,10 @@ static int audit_filter_rules(struct task_struct *tsk,
}
switch (rule->action) {
case AUDIT_NEVER:
- *state = AUDIT_DISABLED;
+ *state = AUDIT_STATE_DISABLED;
break;
case AUDIT_ALWAYS:
- *state = AUDIT_RECORD_CONTEXT;
+ *state = AUDIT_STATE_RECORD;
break;
}
return 1;
@@ -757,14 +782,14 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
&state, true)) {
- if (state == AUDIT_RECORD_CONTEXT)
+ if (state == AUDIT_STATE_RECORD)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return AUDIT_STATE_BUILD;
}
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
@@ -783,33 +808,72 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
-/* At syscall entry and exit time, this filter is called if the
- * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+/**
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
+ * @tsk: associated task
+ * @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
- struct audit_context *ctx,
- struct list_head *list)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
- if (auditd_test_task(tsk))
- return AUDIT_DISABLED;
-
- rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
&state, false)) {
- rcu_read_unlock();
ctx->current_state = state;
- return state;
+ return 1;
}
}
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
+ rcu_read_unlock();
+}
+
+/* At syscall exit time, this filter is called if the audit_state is
+ * not low enough that auditing cannot take place, but is also not
+ * high enough that we already know we have to write an audit record
+ * (i.e., the state is AUDIT_STATE_BUILD).
+ */
+static void audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
}
/*
@@ -818,20 +882,12 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
- struct audit_context *ctx) {
+ struct audit_context *ctx)
+{
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -894,10 +950,80 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
+ context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
+ context->aux_pids = NULL;
+}
+
+/**
+ * audit_reset_context - reset a audit_context structure
+ * @ctx: the audit_context to reset
+ *
+ * All fields in the audit_context will be reset to an initial state, all
+ * references held by fields will be dropped, and private memory will be
+ * released. When this function returns the audit_context will be suitable
+ * for reuse, so long as the passed context is not NULL or a dummy context.
+ */
+static void audit_reset_context(struct audit_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ /* if ctx is non-null, reset the "ctx->context" regardless */
+ ctx->context = AUDIT_CTX_UNUSED;
+ if (ctx->dummy)
+ return;
+
+ /*
+ * NOTE: It shouldn't matter in what order we release the fields, so
+ * release them in the order in which they appear in the struct;
+ * this gives us some hope of quickly making sure we are
+ * resetting the audit_context properly.
+ *
+ * Other things worth mentioning:
+ * - we don't reset "dummy"
+ * - we don't reset "state", we do reset "current_state"
+ * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
+ * - much of this is likely overkill, but play it safe for now
+ * - we really need to work on improving the audit_context struct
+ */
+
+ ctx->current_state = ctx->state;
+ ctx->serial = 0;
+ ctx->major = 0;
+ ctx->uring_op = 0;
+ ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
+ memset(ctx->argv, 0, sizeof(ctx->argv));
+ ctx->return_code = 0;
+ ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
+ ctx->return_valid = AUDITSC_INVALID;
+ audit_free_names(ctx);
+ if (ctx->state != AUDIT_STATE_RECORD) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = NULL;
+ }
+ audit_free_aux(ctx);
+ kfree(ctx->sockaddr);
+ ctx->sockaddr = NULL;
+ ctx->sockaddr_len = 0;
+ ctx->ppid = 0;
+ ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
+ ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
+ ctx->personality = 0;
+ ctx->arch = 0;
+ ctx->target_pid = 0;
+ ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
+ ctx->target_sessionid = 0;
+ ctx->target_sid = 0;
+ ctx->target_comm[0] = '\0';
+ unroll_tree_refs(ctx, NULL, 0);
+ WARN_ON(!list_empty(&ctx->killed_trees));
+ audit_free_module(ctx);
+ ctx->fds[0] = -1;
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -907,10 +1033,13 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
+ context->context = AUDIT_CTX_UNUSED;
context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
+ context->fds[0] = -1;
+ context->return_valid = AUDITSC_INVALID;
return context;
}
@@ -930,15 +1059,16 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
- return 0; /* Return if not auditing. */
+ return 0;
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED) {
- clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ if (state == AUDIT_STATE_DISABLED) {
+ clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
- if (!(context = audit_alloc_context(state))) {
+ context = audit_alloc_context(state);
+ if (!context) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
@@ -946,20 +1076,17 @@ int audit_alloc(struct task_struct *tsk)
context->filterkey = key;
audit_set_context(tsk, context);
- set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ set_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
static inline void audit_free_context(struct audit_context *context)
{
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
+ /* resetting is extra work, but it is likely just noise */
+ audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
- audit_free_aux(context);
kfree(context->filterkey);
- kfree(context->sockaddr);
- audit_proctitle_free(context);
kfree(context);
}
@@ -1172,15 +1299,11 @@ out:
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
- int i;
-
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
@@ -1196,6 +1319,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1208,6 +1378,7 @@ static void show_special(struct audit_context *context, int *call_panic)
switch (context->type) {
case AUDIT_SOCKETCALL: {
int nargs = context->socketcall.nargs;
+
audit_log_format(ab, "nargs=%d", nargs);
for (i = 0; i < nargs; i++)
audit_log_format(ab, " a%d=%lx", i,
@@ -1223,6 +1394,7 @@ static void show_special(struct audit_context *context, int *call_panic)
if (osid) {
char *ctx = NULL;
u32 len;
+
if (security_secid_to_secctx(osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", osid);
*call_panic = 1;
@@ -1272,6 +1444,7 @@ static void show_special(struct audit_context *context, int *call_panic)
break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+
audit_log_format(ab,
"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
"mq_curmsgs=%ld ",
@@ -1290,6 +1463,12 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
+ case AUDIT_OPENAT2:
+ audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
+ context->openat2.flags,
+ context->openat2.mode,
+ context->openat2.resolve);
+ break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
@@ -1301,6 +1480,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -1308,6 +1492,7 @@ static void show_special(struct audit_context *context, int *call_panic)
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
char *end = proctitle + len - 1;
+
while (end > proctitle && !isprint(*end))
end--;
@@ -1349,7 +1534,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
/* name was specified as a relative path and the
* directory component is the cwd
*/
- audit_log_d_path(ab, " name=", &context->pwd);
+ if (context->pwd.dentry && context->pwd.mnt)
+ audit_log_d_path(ab, " name=", &context->pwd);
+ else
+ audit_log_format(ab, " name=(null)");
break;
default:
/* log the name's directory component */
@@ -1417,9 +1605,6 @@ static void audit_log_proctitle(void)
struct audit_context *context = audit_context();
struct audit_buffer *ab;
- if (!context || context->dummy)
- return;
-
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
@@ -1452,6 +1637,44 @@ out:
audit_log_end(ab);
}
+/**
+ * audit_log_uring - generate a AUDIT_URINGOP record
+ * @ctx: the audit context
+ */
+static void audit_log_uring(struct audit_context *ctx)
+{
+ struct audit_buffer *ab;
+ const struct cred *cred;
+
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
+ if (!ab)
+ return;
+ cred = current_cred();
+ audit_log_format(ab, "uring_op=%d", ctx->uring_op);
+ if (ctx->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (ctx->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ ctx->return_code);
+ audit_log_format(ab,
+ " items=%d"
+ " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
+ " fsuid=%u egid=%u sgid=%u fsgid=%u",
+ ctx->name_count,
+ task_ppid_nr(current), task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid));
+ audit_log_task_context(ab);
+ audit_log_key(ab, ctx->filterkey);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(void)
{
int i, call_panic = 0;
@@ -1462,29 +1685,38 @@ static void audit_log_exit(void)
context->personality = current->personality;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
- if (!ab)
- return; /* audit_panic has been called */
- audit_log_format(ab, "arch=%x syscall=%d",
- context->arch, context->major);
- if (context->personality != PER_LINUX)
- audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid)
- audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
- context->return_code);
-
- audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count);
-
- audit_log_task_info(ab);
- audit_log_key(ab, context->filterkey);
- audit_log_end(ab);
+ switch (context->context) {
+ case AUDIT_CTX_SYSCALL:
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return;
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+ audit_log_task_info(ab);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+ break;
+ case AUDIT_CTX_URING:
+ audit_log_uring(context);
+ break;
+ default:
+ BUG();
+ break;
+ }
for (aux = context->aux; aux; aux = aux->next) {
@@ -1496,6 +1728,7 @@ static void audit_log_exit(void)
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+
audit_log_format(ab, "fver=%x", axs->fcap_ver);
audit_log_cap(ab, "fp", &axs->fcap.permitted);
audit_log_cap(ab, "fi", &axs->fcap.inheritable);
@@ -1574,21 +1807,22 @@ static void audit_log_exit(void)
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle();
+ if (context->context == AUDIT_CTX_SYSCALL)
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
- audit_panic("error converting sid to string");
+ audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
@@ -1597,23 +1831,30 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
- * random task_struct that doesn't doesn't have any meaningful data we
+ * random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
- if (tsk == current && !context->dummy && context->in_syscall) {
- context->return_valid = 0;
+ if (tsk == current && !context->dummy) {
+ context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
-
- audit_filter_syscall(tsk, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(tsk, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit();
+ if (context->context == AUDIT_CTX_SYSCALL) {
+ audit_filter_syscall(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_exit();
+ } else if (context->context == AUDIT_CTX_URING) {
+ /* TODO: verify this case is real and valid */
+ audit_filter_uring(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_uring(context);
+ }
}
audit_set_context(tsk, NULL);
@@ -1621,6 +1862,137 @@ void __audit_free(struct task_struct *tsk)
}
/**
+ * audit_return_fixup - fixup the return codes in the audit_context
+ * @ctx: the audit_context
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * We need to fixup the return code in the audit logs if the actual return
+ * codes are later going to be fixed by the arch specific signal handlers.
+ */
+static void audit_return_fixup(struct audit_context *ctx,
+ int success, long code)
+{
+ /*
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(code <= -ERESTARTSYS) &&
+ (code >= -ERESTART_RESTARTBLOCK) &&
+ (code != -ENOIOCTLCMD))
+ ctx->return_code = -EINTR;
+ else
+ ctx->return_code = code;
+ ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
+}
+
+/**
+ * __audit_uring_entry - prepare the kernel task's audit context for io_uring
+ * @op: the io_uring opcode
+ *
+ * This is similar to audit_syscall_entry() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_entry() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_entry(u8 op)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->state == AUDIT_STATE_DISABLED)
+ return;
+
+ /*
+ * NOTE: It's possible that we can be called from the process' context
+ * before it returns to userspace, and before audit_syscall_exit()
+ * is called. In this case there is not much to do, just record
+ * the io_uring details and return.
+ */
+ ctx->uring_op = op;
+ if (ctx->context == AUDIT_CTX_SYSCALL)
+ return;
+
+ ctx->dummy = !audit_n_rules;
+ if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
+ ctx->prio = 0;
+
+ ctx->context = AUDIT_CTX_URING;
+ ctx->current_state = ctx->state;
+ ktime_get_coarse_real_ts64(&ctx->ctime);
+}
+
+/**
+ * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * This is similar to audit_syscall_exit() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_exit() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_exit(int success, long code)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->dummy) {
+ if (ctx->context != AUDIT_CTX_URING)
+ return;
+ goto out;
+ }
+
+ audit_return_fixup(ctx, success, code);
+ if (ctx->context == AUDIT_CTX_SYSCALL) {
+ /*
+ * NOTE: See the note in __audit_uring_entry() about the case
+ * where we may be called from process context before we
+ * return to userspace via audit_syscall_exit(). In this
+ * case we simply emit a URINGOP record and bail, the
+ * normal syscall exit handling will take care of
+ * everything else.
+ * It is also worth mentioning that when we are called,
+ * the current process creds may differ from the creds
+ * used during the normal syscall processing; keep that
+ * in mind if/when we move the record generation code.
+ */
+
+ /*
+ * We need to filter on the syscall info here to decide if we
+ * should emit a URINGOP record. I know it seems odd but this
+ * solves the problem where users have a filter to block *all*
+ * syscall records in the "exit" filter; we want to preserve
+ * the behavior here.
+ */
+ audit_filter_syscall(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ return;
+
+ audit_log_uring(ctx);
+ return;
+ }
+
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&ctx->killed_trees))
+ audit_kill_trees(ctx);
+
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ goto out;
+ audit_log_exit();
+
+out:
+ audit_reset_context(ctx);
+}
+
+/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
@@ -1631,7 +2003,7 @@ void __audit_free(struct task_struct *tsk)
* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
- * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
* be written).
@@ -1645,14 +2017,19 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
if (!audit_enabled || !context)
return;
- BUG_ON(context->in_syscall || context->name_count);
+ WARN_ON(context->context != AUDIT_CTX_UNUSED);
+ WARN_ON(context->name_count);
+ if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
+ audit_panic("unrecoverable error in audit_syscall_entry()");
+ return;
+ }
state = context->state;
- if (state == AUDIT_DISABLED)
+ if (state == AUDIT_STATE_DISABLED)
return;
context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ if (!context->dummy && state == AUDIT_STATE_BUILD) {
context->prio = 0;
if (auditd_test_task(current))
return;
@@ -1664,10 +2041,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
- context->serial = 0;
- context->in_syscall = 1;
+ context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- context->ppid = 0;
ktime_get_coarse_real_ts64(&context->ctime);
}
@@ -1677,71 +2052,34 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
* @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
- * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * auditable (either because of the AUDIT_STATE_RECORD state from
* filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct audit_context *context;
+ struct audit_context *context = audit_context();
- context = audit_context();
- if (!context)
- return;
+ if (!context || context->dummy ||
+ context->context != AUDIT_CTX_SYSCALL)
+ goto out;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
- if (!context->dummy && context->in_syscall) {
- if (success)
- context->return_valid = AUDITSC_SUCCESS;
- else
- context->return_valid = AUDITSC_FAILURE;
+ audit_return_fixup(context, success, return_code);
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_syscall(current, context);
+ audit_filter_inodes(current, context);
+ if (context->current_state != AUDIT_STATE_RECORD)
+ goto out;
- /*
- * we need to fix up the return code in the audit logs if the
- * actual return codes are later going to be fixed up by the
- * arch specific signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- audit_filter_syscall(current, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(current, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit();
- }
-
- context->in_syscall = 0;
- context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
+ audit_log_exit();
+
+out:
+ audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
@@ -1750,6 +2088,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
+
if (likely(!inode->i_fsnotify_marks))
return;
context = audit_context();
@@ -1789,10 +2128,12 @@ retry:
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
- for(;;) {
+ for (;;) {
struct inode *inode = d_backing_inode(d);
+
if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
+
chunk = audit_tree_lookup(inode);
if (chunk) {
if (unlikely(!put_tree_ref(context, chunk))) {
@@ -1848,6 +2189,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
return aname;
}
@@ -1869,7 +2212,7 @@ __audit_reusename(const __user char *uptr)
if (!n->name)
continue;
if (n->name->uptr == uptr) {
- n->name->refcnt++;
+ atomic_inc(&n->name->refcnt);
return n->name;
}
}
@@ -1888,7 +2231,7 @@ void __audit_getname(struct filename *name)
struct audit_context *context = audit_context();
struct audit_names *n;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@@ -1898,10 +2241,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- name->refcnt++;
-
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
+ atomic_inc(&name->refcnt);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -1913,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -1963,7 +2303,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2033,7 +2373,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- name->refcnt++;
+ atomic_inc(&name->refcnt);
}
out:
@@ -2081,7 +2421,7 @@ void __audit_inode_child(struct inode *parent,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2120,6 +2460,8 @@ void __audit_inode_child(struct inode *parent,
}
}
+ cond_resched();
+
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
@@ -2158,7 +2500,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- found_child->name->refcnt++;
+ atomic_inc(&found_child->name->refcnt);
}
}
@@ -2180,7 +2522,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
- if (!ctx->in_syscall)
+ if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
@@ -2189,7 +2531,7 @@ int auditsc_get_stamp(struct audit_context *ctx,
*serial = ctx->serial;
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
return 1;
}
@@ -2271,6 +2613,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
struct audit_context *context = audit_context();
+
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
@@ -2284,6 +2627,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
struct audit_context *context = audit_context();
+
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
@@ -2348,6 +2692,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
void __audit_fd_pair(int fd1, int fd2)
{
struct audit_context *context = audit_context();
+
context->fds[0] = fd1;
context->fds[1] = fd2;
}
@@ -2365,6 +2710,7 @@ int __audit_sockaddr(int len, void *a)
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+
if (!p)
return -ENOMEM;
context->sockaddr = p;
@@ -2383,7 +2729,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &context->target_sid);
+ security_task_getsecid_obj(t, &context->target_sid);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2410,7 +2756,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &ctx->target_sid);
+ security_task_getsecid_obj(t, &ctx->target_sid);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2431,7 +2777,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
@@ -2464,7 +2810,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
@@ -2495,6 +2842,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = audit_context();
+
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
@@ -2506,11 +2854,22 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
void __audit_mmap_fd(int fd, int flags)
{
struct audit_context *context = audit_context();
+
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
}
+void __audit_openat2_how(struct open_how *how)
+{
+ struct audit_context *context = audit_context();
+
+ context->openat2.flags = how->flags;
+ context->openat2.mode = how->mode;
+ context->openat2.resolve = how->resolve;
+ context->type = AUDIT_OPENAT2;
+}
+
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
@@ -2521,48 +2880,54 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
- enum audit_nfcfgop op)
+ enum audit_nfcfgop op, gfp_t gfp)
{
struct audit_buffer *ab;
char comm[sizeof(current->comm)];
- ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG);
+ ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
if (!ab)
return;
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
@@ -2671,7 +3036,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
- if (likely(!ctx || !ctx->in_syscall))
+ if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a2a97fa3071b..370217dd7e39 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data)
complete(&backtrace_work);
}
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
static void backtrace_test_irq(void)
{
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..b529182e8b04 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
new file mode 100644
index 000000000000..6a906ff93006
--- /dev/null
+++ b/kernel/bpf/Kconfig
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# BPF interpreter that, for example, classic socket filters depend on.
+config BPF
+ bool
+
+# Used by archs to tell that they support BPF JIT compiler plus which
+# flavour. Only one of the two can be selected for a specific arch since
+# eBPF JIT supersedes the cBPF JIT.
+
+# Classic BPF JIT (cBPF)
+config HAVE_CBPF_JIT
+ bool
+
+# Extended BPF JIT (eBPF)
+config HAVE_EBPF_JIT
+ bool
+
+# Used by archs to tell that they want the BPF JIT compiler enabled by
+# default for kernels that were compiled with BPF JIT support.
+config ARCH_WANT_DEFAULT_BPF_JIT
+ bool
+
+menu "BPF subsystem"
+
+config BPF_SYSCALL
+ bool "Enable bpf() system call"
+ select BPF
+ select IRQ_WORK
+ select TASKS_RCU if PREEMPTION
+ select TASKS_TRACE_RCU
+ select BINARY_PRINTF
+ select NET_SOCK_MSG if NET
+ select NET_XGRESS if NET
+ select PAGE_POOL if NET
+ default n
+ help
+ Enable the bpf() system call that allows to manipulate BPF programs
+ and maps via file descriptors.
+
+config BPF_JIT
+ bool "Enable BPF Just In Time compiler"
+ depends on BPF
+ depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
+ depends on MODULES
+ help
+ BPF programs are normally handled by a BPF interpreter. This option
+ allows the kernel to generate native code when a program is loaded
+ into the kernel. This will significantly speed-up processing of BPF
+ programs.
+
+ Note, an admin should enable this feature changing:
+ /proc/sys/net/core/bpf_jit_enable
+ /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_kallsyms (optional)
+
+config BPF_JIT_ALWAYS_ON
+ bool "Permanently enable BPF JIT and remove BPF interpreter"
+ depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+ help
+ Enables BPF JIT and removes BPF interpreter to avoid speculative
+ execution of BPF instructions by the interpreter.
+
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
+config BPF_JIT_DEFAULT_ON
+ def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
+ depends on HAVE_EBPF_JIT && BPF_JIT
+
+config BPF_UNPRIV_DEFAULT_OFF
+ bool "Disable unprivileged BPF by default"
+ default y
+ depends on BPF_SYSCALL
+ help
+ Disables unprivileged BPF by default by setting the corresponding
+ /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
+ still reenable it by setting it to 0 later on, or permanently
+ disable it by setting it to 1 (from which no other transition to
+ 0 is possible anymore).
+
+ Unprivileged BPF could be used to exploit certain potential
+ speculative execution side-channel vulnerabilities on unmitigated
+ affected hardware.
+
+ If you are unsure how to answer this question, answer Y.
+
+source "kernel/bpf/preload/Kconfig"
+
+config BPF_LSM
+ bool "Enable BPF LSM Instrumentation"
+ depends on BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on SECURITY
+ depends on BPF_JIT
+ help
+ Enables instrumentation of the security hooks with BPF programs for
+ implementing dynamic MAC and Audit Policies.
+
+ If you are unsure how to answer this question, answer N.
+
+endmenu # "BPF subsystem"
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1131a921e1a6..f526b7573e97 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,23 +1,34 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
-CFLAGS_core.o += $(call cc-disable-warning, override-init)
+ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
+# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
+cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
+endif
+CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
-obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
+obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += tcx.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
@@ -27,5 +38,11 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 11584618e861..0bdbbbeab155 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -10,11 +10,14 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
+ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
+ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -32,8 +35,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
int i;
for (i = 0; i < array->map.max_entries; i++) {
- ptr = __alloc_percpu_gfp(array->elem_size, 8,
- GFP_USER | __GFP_NOWARN);
+ ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
if (!ptr) {
bpf_array_free_percpu(array);
return -ENOMEM;
@@ -60,13 +63,15 @@ int array_map_alloc_check(union bpf_attr *attr)
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
- attr->map_flags & BPF_F_MMAPABLE)
+ attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
return -EINVAL;
- if (attr->value_size > KMALLOC_MAX_SIZE)
- /* if value_size is bigger, the user space won't be able to
- * access the elements.
- */
+ if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ attr->map_flags & BPF_F_PRESERVE_ELEMS)
+ return -EINVAL;
+
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
return -E2BIG;
return 0;
@@ -75,11 +80,10 @@ int array_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
- int ret, numa_node = bpf_map_attr_numa_node(attr);
+ int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
bool bypass_spec_v1 = bpf_bypass_spec_v1();
- u64 cost, array_size, mask64;
- struct bpf_map_memory mem;
+ u64 array_size, mask64;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@@ -120,44 +124,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
}
- /* make sure there is no u32 overflow later in round_up() */
- cost = array_size;
- if (percpu)
- cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
-
- ret = bpf_map_charge_init(&mem, cost);
- if (ret < 0)
- return ERR_PTR(ret);
-
/* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
- if (!data) {
- bpf_map_charge_finish(&mem);
+ if (!data)
return ERR_PTR(-ENOMEM);
- }
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
- if (!array) {
- bpf_map_charge_finish(&mem);
+ if (!array)
return ERR_PTR(-ENOMEM);
- }
array->index_mask = index_mask;
array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
- bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -165,6 +154,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -174,7 +168,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
@@ -208,15 +202,18 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
-static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
@@ -249,6 +246,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -264,11 +275,12 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -295,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -315,19 +327,21 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
- array->elem_size * (index & array->index_mask);
+ (u64)array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -359,11 +373,12 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -371,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -381,17 +396,41 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ /* We don't reset or free fields other than timer on uref dropping to zero. */
+ if (!btf_record_has_field(map->record, BPF_TIMER))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding programs to complete
- * and free the array
- */
- synchronize_rcu();
+ if (!IS_ERR_OR_NULL(map->record)) {
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
+ }
+ }
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -494,11 +533,224 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + pgoff);
}
+static bool array_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ if (!bpf_map_meta_equal(meta0, meta1))
+ return false;
+ return meta0->map_flags & BPF_F_INNER_MAP ? true :
+ meta0->max_entries == meta1->max_entries;
+}
+
+struct bpf_iter_seq_array_map_info {
+ struct bpf_map *map;
+ void *percpu_value_buf;
+ u32 index;
+};
+
+static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array_map_elem_ptr(array, index);
+}
+
+static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ ++*pos;
+ ++info->index;
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array_map_elem_ptr(array, index);
+}
+
+static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int off = 0, cpu = 0;
+ void __percpu **pptr;
+ u32 size;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, v == NULL);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+
+ if (!info->percpu_value_buf) {
+ ctx.value = v;
+ } else {
+ pptr = v;
+ size = array->elem_size;
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
+ off += size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_array_map_seq_show(seq, v);
+}
+
+static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_array_map_seq_show(seq, NULL);
+}
+
+static int bpf_iter_init_array_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ buf_size = array->elem_size * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
+ seq_info->map = map;
+ return 0;
+}
+
+static void bpf_iter_fini_array_map(void *priv_data)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_array_map_seq_ops = {
+ .start = bpf_array_map_seq_start,
+ .next = bpf_array_map_seq_next,
+ .stop = bpf_array_map_seq_stop,
+ .show = bpf_array_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_array_map_seq_ops,
+ .init_seq_private = bpf_iter_init_array_map,
+ .fini_seq_private = bpf_iter_fini_array_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
+};
+
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ u32 i, key, num_elems = 0;
+ struct bpf_array *array;
+ bool is_percpu;
+ u64 ret = 0;
+ void *val;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ array = container_of(map, struct bpf_array, map);
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < map->max_entries; i++) {
+ if (is_percpu)
+ val = this_cpu_ptr(array->pptrs[i]);
+ else
+ val = array_map_elem_ptr(array, i);
+ num_elems++;
+ key = i;
+ ret = callback_fn((u64)(long)map, (u64)(long)&key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ break;
+ }
+
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
+ .map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -510,9 +762,15 @@ const struct bpf_map_ops array_map_ops = {
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
const struct bpf_map_ops percpu_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
@@ -520,8 +778,16 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -540,8 +806,6 @@ static void fd_array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- synchronize_rcu();
-
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
@@ -603,11 +867,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -626,23 +890,27 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
return 0;
} else {
return -ENOENT;
}
}
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return __fd_array_map_delete_elem(map, key, true);
+}
+
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
@@ -650,8 +918,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
return prog;
}
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
}
@@ -661,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, need_defer);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -749,6 +1018,12 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
mutex_unlock(&aux->poke_mutex);
}
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ WARN_ON_ONCE(1);
+}
+
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
@@ -761,7 +1036,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
- int i, ret;
+ int i;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
@@ -773,29 +1048,19 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* there could be danger of use after free otherwise.
* 2) Initially when we start tracking aux, the program
* is not JITed yet and also does not have a kallsyms
- * entry. We skip these as poke->ip_stable is not
- * active yet. The JIT will do the final fixup before
- * setting it stable. The various poke->ip_stable are
- * successively activated, so tail call updates can
- * arrive from here while JIT is still finishing its
- * final fixup for non-activated poke entries.
- * 3) On program teardown, the program's kallsym entry gets
- * removed out of RCU callback, but we can only untrack
- * from sleepable context, therefore bpf_arch_text_poke()
- * might not see that this is in BPF text section and
- * bails out with -EINVAL. As these are unreachable since
- * RCU grace period already passed, we simply skip them.
- * 4) Also programs reaching refcount of zero while patching
+ * entry. We skip these as poke->tailcall_target_stable
+ * is not active yet. The JIT will do the final fixup
+ * before setting it stable. The various
+ * poke->tailcall_target_stable are successively
+ * activated, so tail call updates can arrive from here
+ * while JIT is still finishing its final fixup for
+ * non-activated poke entries.
+ * 3) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
- * buffer is freed. When we're still in the middle of
- * patching and suddenly kallsyms entry of the program
- * gets evicted, we just skip the rest which is fine due
- * to point 3).
- * 5) Any other error happening below from bpf_arch_text_poke()
- * is a unexpected bug.
+ * buffer is freed.
*/
- if (!READ_ONCE(poke->ip_stable))
+ if (!READ_ONCE(poke->tailcall_target_stable))
continue;
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
@@ -803,12 +1068,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
- old ? (u8 *)old->bpf_func +
- poke->adj_off : NULL,
- new ? (u8 *)new->bpf_func +
- poke->adj_off : NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
+ bpf_arch_poke_desc_update(poke, new, old);
}
}
}
@@ -817,7 +1077,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, true);
bpf_map_put(map);
}
@@ -834,7 +1094,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
struct bpf_array_aux *aux;
struct bpf_map *map;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
if (!aux)
return ERR_PTR(-ENOMEM);
@@ -868,6 +1128,11 @@ static void prog_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
+/* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing static check alone in the verifier is not enough.
+ * Thus, prog_array_map cannot be used as an inner_map
+ * and map_meta_equal is not implemented.
+ */
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -883,6 +1148,8 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -890,7 +1157,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
{
struct bpf_event_entry *ee;
- ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
@@ -940,8 +1207,9 @@ err_out:
return ee;
}
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
@@ -952,19 +1220,30 @@ static void perf_event_fd_array_release(struct bpf_map *map,
struct bpf_event_entry *ee;
int i;
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ return;
+
rcu_read_lock();
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, true);
}
rcu_read_unlock();
}
+static void perf_event_fd_array_map_free(struct bpf_map *map)
+{
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ bpf_fd_array_map_clear(map, false);
+ fd_array_map_free(map);
+}
+
const struct bpf_map_ops perf_event_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
- .map_free = fd_array_map_free,
+ .map_free = perf_event_fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
@@ -972,6 +1251,8 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#ifdef CONFIG_CGROUPS
@@ -982,7 +1263,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
return cgroup_get_from_fd(fd);
}
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
@@ -990,11 +1271,12 @@ static void cgroup_fd_array_put_ptr(void *ptr)
static void cgroup_fd_array_free(struct bpf_map *map)
{
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
const struct bpf_map_ops cgroup_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
@@ -1004,6 +1286,8 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1032,7 +1316,7 @@ static void array_of_map_free(struct bpf_map *map)
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1046,11 +1330,11 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
-static u32 array_of_map_gen_lookup(struct bpf_map *map,
+static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
@@ -1088,5 +1372,9 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
new file mode 100644
index 000000000000..addf3dd57b59
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/btf_ids.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+ struct bpf_map map;
+ u32 bitset_mask;
+ u32 hash_seed;
+ u32 nr_hash_funcs;
+ unsigned long bitset[];
+};
+
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+ u32 value_size, u32 index)
+{
+ u32 h;
+
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
+ else
+ h = jhash(value, value_size, bloom->hash_seed + index);
+
+ return h & bloom->bitset_mask;
+}
+
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ if (!test_bit(h, bloom->bitset))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ if (flags != BPF_ANY)
+ return -EINVAL;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ set_bit(h, bloom->bitset);
+ }
+
+ return 0;
+}
+
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
+{
+ u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_bloom_filter *bloom;
+
+ if (attr->key_size != 0 || attr->value_size == 0 ||
+ attr->max_entries == 0 ||
+ attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ /* The lower 4 bits of map_extra (0xF) specify the number
+ * of hash functions
+ */
+ (attr->map_extra & ~0xF))
+ return ERR_PTR(-EINVAL);
+
+ nr_hash_funcs = attr->map_extra;
+ if (nr_hash_funcs == 0)
+ /* Default to using 5 hash functions if unspecified */
+ nr_hash_funcs = 5;
+
+ /* For the bloom filter, the optimal bit array size that minimizes the
+ * false positive probability is n * k / ln(2) where n is the number of
+ * expected entries in the bloom filter and k is the number of hash
+ * functions. We use 7 / 5 to approximate 1 / ln(2).
+ *
+ * We round this up to the nearest power of two to enable more efficient
+ * hashing using bitmasks. The bitmask will be the bit array size - 1.
+ *
+ * If this overflows a u32, the bit array size will have 2^32 (4
+ * GB) bits.
+ */
+ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+ check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+ nr_bits > (1UL << 31)) {
+ /* The bit array size is 2^32 bits but to avoid overflowing the
+ * u32, we use U32_MAX, which will round up to the equivalent
+ * number of bytes
+ */
+ bitset_bytes = BITS_TO_BYTES(U32_MAX);
+ bitset_mask = U32_MAX;
+ } else {
+ if (nr_bits <= BITS_PER_LONG)
+ nr_bits = BITS_PER_LONG;
+ else
+ nr_bits = roundup_pow_of_two(nr_bits);
+ bitset_bytes = BITS_TO_BYTES(nr_bits);
+ bitset_mask = nr_bits - 1;
+ }
+
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+ if (!bloom)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&bloom->map, attr);
+
+ bloom->nr_hash_funcs = nr_hash_funcs;
+ bloom->bitset_mask = bitset_mask;
+
+ if (!(attr->map_flags & BPF_F_ZERO_SEED))
+ bloom->hash_seed = get_random_u32();
+
+ return &bloom->map;
+}
+
+static void bloom_map_free(struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+
+ bpf_map_area_free(bloom);
+}
+
+static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ /* The eBPF program should use map_peek_elem instead */
+ return ERR_PTR(-EINVAL);
+}
+
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ /* The eBPF program should use map_push_elem instead */
+ return -EINVAL;
+}
+
+static int bloom_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Bloom filter maps are keyless */
+ return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
+const struct bpf_map_ops bloom_filter_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = bloom_map_alloc,
+ .map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
+ .map_push_elem = bloom_map_push_elem,
+ .map_peek_elem = bloom_map_peek_elem,
+ .map_pop_elem = bloom_map_pop_elem,
+ .map_lookup_elem = bloom_map_lookup_elem,
+ .map_update_elem = bloom_map_update_elem,
+ .map_delete_elem = bloom_map_delete_elem,
+ .map_check_btf = bloom_map_check_btf,
+ .map_mem_usage = bloom_map_mem_usage,
+ .map_btf_id = &bpf_bloom_map_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..28efd0a3f220
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_cgrp_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_cgrp_storage_unlock();
+ rcu_read_unlock();
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+ return 0;
+}
+
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, gfp_flags);
+
+unlock:
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
new file mode 100644
index 000000000000..b0ef45db207c
--- /dev/null
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(inode_cache);
+
+static struct bpf_local_storage __rcu **
+inode_storage_ptr(void *owner)
+{
+ struct inode *inode = owner;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+ return &bsb->storage;
+}
+
+static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
+ struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *inode_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
+ if (!inode_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
+}
+
+void bpf_inode_storage_free(struct inode *inode)
+{
+ struct bpf_local_storage *local_storage;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return;
+
+ rcu_read_lock();
+
+ local_storage = rcu_dereference(bsb->storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_local_storage_destroy(local_storage);
+ rcu_read_unlock();
+}
+
+static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct fd f = fdget_raw(*(int *)key);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ sdata = inode_storage_lookup(file_inode(f.file), map, true);
+ fdput(f);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct fd f = fdget_raw(*(int *)key);
+
+ if (!f.file)
+ return -EBADF;
+ if (!inode_storage_ptr(file_inode(f.file))) {
+ fdput(f);
+ return -EBADF;
+ }
+
+ sdata = bpf_local_storage_update(file_inode(f.file),
+ (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ fdput(f);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = inode_storage_lookup(inode, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct fd f = fdget_raw(*(int *)key);
+ int err;
+
+ if (!f.file)
+ return -EBADF;
+
+ err = inode_storage_delete(file_inode(f.file), map);
+ fdput(f);
+ return err;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ /* explicitly check that the inode_storage_ptr is not
+ * NULL as inode_storage_lookup returns NULL in this case and
+ * bpf_local_storage_update expects the owner to have a
+ * valid storage pointer.
+ */
+ if (!inode || !inode_storage_ptr(inode))
+ return (unsigned long)NULL;
+
+ sdata = inode_storage_lookup(inode, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+ /* This helper must only called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ sdata = bpf_local_storage_update(
+ inode, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_inode_storage_delete,
+ struct bpf_map *, map, struct inode *, inode)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!inode)
+ return -EINVAL;
+
+ /* This helper must only called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ return inode_storage_delete(inode, map);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+}
+
+static void inode_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
+}
+
+const struct bpf_map_ops inode_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = inode_storage_map_alloc,
+ .map_free = inode_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
+ .map_update_elem = bpf_fd_inode_storage_update_elem,
+ .map_delete_elem = bpf_fd_inode_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = inode_storage_ptr,
+};
+
+BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_get_proto = {
+ .func = bpf_inode_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_inode_storage_delete_proto = {
+ .func = bpf_inode_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index dd612b80b9fe..0fae79164187 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -14,11 +15,13 @@ struct bpf_iter_target_info {
struct bpf_iter_link {
struct bpf_link link;
+ struct bpf_iter_aux_info aux;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
+ const struct bpf_iter_seq_info *seq_info;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
@@ -35,7 +38,8 @@ static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
@@ -64,11 +68,27 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
+static bool bpf_iter_support_resched(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
+}
+
+/* maximum visited objects before bailing out */
+#define MAX_ITER_OBJECTS 1000000
+
/* bpf_seq_read, a customized and simpler version for bpf iterator.
- * no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
- * . assuming no_llseek
+ * . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
@@ -76,14 +96,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
- int err = 0;
+ int err = 0, num_objs = 0;
+ bool can_resched;
void *p;
mutex_lock(&seq->lock);
if (!seq->buf) {
- seq->size = PAGE_SIZE;
- seq->buf = kmalloc(seq->size, GFP_KERNEL);
+ seq->size = PAGE_SIZE << 3;
+ seq->buf = kvmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
@@ -129,9 +150,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
goto done;
}
+ can_resched = bpf_iter_support_resched(seq);
while (1) {
loff_t pos = seq->index;
+ num_objs++;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
@@ -150,6 +173,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
if (seq->count >= size)
break;
+ if (num_objs >= MAX_ITER_OBJECTS) {
+ if (offs == 0) {
+ err = -EAGAIN;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
@@ -164,9 +196,17 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
break;
}
+
+ if (can_resched)
+ cond_resched();
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -199,11 +239,25 @@ done:
return copied;
}
+static const struct bpf_iter_seq_info *
+__get_seq_info(struct bpf_iter_link *link)
+{
+ const struct bpf_iter_seq_info *seq_info;
+
+ if (link->aux.map) {
+ seq_info = link->aux.map->ops->iter_seq_info;
+ if (seq_info)
+ return seq_info;
+ }
+
+ return link->tinfo->reg_info->seq_info;
+}
+
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
- return prepare_seq_file(file, link);
+ return prepare_seq_file(file, link, __get_seq_info(link));
}
static int iter_release(struct inode *inode, struct file *file)
@@ -218,8 +272,8 @@ static int iter_release(struct inode *inode, struct file *file)
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- if (iter_priv->tinfo->reg_info->fini_seq_private)
- iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
+ if (iter_priv->seq_info->fini_seq_private)
+ iter_priv->seq_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
seq->private = iter_priv;
@@ -243,7 +297,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
@@ -285,39 +339,65 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
- struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
- bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
- supported = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id && iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
- if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
- cache_btf_id(tinfo, prog);
- supported = true;
+ if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+ cache_btf_id(iter, prog);
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (supported) {
+ if (tinfo) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
- return supported;
+ return tinfo != NULL;
+}
+
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -355,10 +435,68 @@ out_unlock:
return ret;
}
+static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ bpf_iter_show_fdinfo_t show_fdinfo;
+
+ seq_printf(seq,
+ "target_name:\t%s\n",
+ iter_link->tinfo->reg_info->target);
+
+ show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
+ if (show_fdinfo)
+ show_fdinfo(&iter_link->aux, seq);
+}
+
+static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
+ bpf_iter_fill_link_info_t fill_link_info;
+ u32 ulen = info->iter.target_name_len;
+ const char *target_name;
+ u32 target_len;
+
+ if (!ulen ^ !ubuf)
+ return -EINVAL;
+
+ target_name = iter_link->tinfo->reg_info->target;
+ target_len = strlen(target_name);
+ info->iter.target_name_len = target_len + 1;
+
+ if (ubuf) {
+ if (ulen >= target_len + 1) {
+ if (copy_to_user(ubuf, target_name, target_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, target_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+ }
+
+ fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
+ if (fill_link_info)
+ return fill_link_info(&iter_link->aux, info);
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
+ .show_fdinfo = bpf_iter_link_show_fdinfo,
+ .fill_link_info = bpf_iter_link_fill_link_info,
};
bool bpf_link_is_iter(struct bpf_link *link)
@@ -366,30 +504,53 @@ bool bpf_link_is_iter(struct bpf_link *link)
return link->ops == &bpf_iter_link_lops;
}
-int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
{
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
struct bpf_link_primer link_primer;
- struct bpf_iter_target_info *tinfo;
+ union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
- bool existed = false;
- u32 prog_btf_id;
+ u32 prog_btf_id, linfo_len;
+ bpfptr_t ulinfo;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
+ linfo_len = attr->link_create.iter_info_len;
+ if (bpfptr_is_null(ulinfo) ^ !linfo_len)
+ return -EINVAL;
+
+ if (!bpfptr_is_null(ulinfo)) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id == prog_btf_id) {
- existed = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (!existed)
+ if (!tinfo)
return -ENOENT;
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -397,27 +558,38 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
}
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
+ const struct bpf_iter_seq_info *seq_info,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
+ priv_data->seq_info = seq_info;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
@@ -433,21 +605,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
- tinfo->reg_info->seq_priv_size;
- priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
+ seq_info->seq_priv_size;
+ priv_data = __seq_open_private(file, seq_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
- if (tinfo->reg_info->init_seq_private) {
- err = tinfo->reg_info->init_seq_private(priv_data->target_private);
+ if (seq_info->init_seq_private) {
+ err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
if (err)
goto release_seq_file;
}
- init_seq_meta(priv_data, tinfo, prog);
+ init_seq_meta(priv_data, tinfo, seq_info, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
@@ -463,6 +635,7 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
+ struct bpf_iter_link *iter_link;
struct file *file;
unsigned int flags;
int err, fd;
@@ -481,8 +654,8 @@ int bpf_iter_new_fd(struct bpf_link *link)
goto free_fd;
}
- err = prepare_seq_file(file,
- container_of(link, struct bpf_iter_link, link));
+ iter_link = container_of(link, struct bpf_iter_link, link);
+ err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
if (err)
goto free_file;
@@ -521,13 +694,27 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = BPF_PROG_RUN(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->aux->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock();
+ migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
@@ -537,3 +724,121 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
*/
return ret == 0 ? 0 : -EAGAIN;
}
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+ void *, callback_ctx, u64, flags)
+{
+ return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+ .func = bpf_for_each_map_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > BPF_MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
new file mode 100644
index 000000000000..146824cc9689
--- /dev/null
+++ b/kernel/bpf/bpf_local_storage.c
@@ -0,0 +1,917 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
+
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+}
+
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (!map->ops->map_local_storage_charge)
+ return 0;
+
+ return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+ u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (map->ops->map_local_storage_uncharge)
+ map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+ struct bpf_map *map = &smap->map;
+
+ return map->ops->map_owner_storage_ptr(owner);
+}
+
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->snode);
+}
+
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->map_node);
+}
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+ void *value, bool charge_mem, gfp_t gfp_flags)
+{
+ struct bpf_local_storage_elem *selem;
+
+ if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ return NULL;
+
+ if (smap->bpf_ma) {
+ migrate_disable();
+ selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+ migrate_enable();
+ if (selem)
+ /* Keep the original bpf_map_kzalloc behavior
+ * before started using the bpf_mem_cache_alloc.
+ *
+ * No need to use zero_map_value. The bpf_selem_free()
+ * only does bpf_mem_cache_free when there is
+ * no other bpf prog is using the selem.
+ */
+ memset(SDATA(selem)->data, 0, smap->map.value_size);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
+ if (selem) {
+ if (value)
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
+ /* No need to call check_and_init_map_value as memory is zero init */
+ return selem;
+ }
+
+ if (charge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ return NULL;
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ bpf_mem_cache_raw_free(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool bpf_ma, bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!bpf_ma) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+ return;
+ }
+
+ if (smap) {
+ migrate_disable();
+ bpf_mem_cache_free(&smap->storage_ma, local_storage);
+ migrate_enable();
+ } else {
+ /* smap could be NULL if the selem that triggered
+ * this 'local_storage' creation had been long gone.
+ * In this case, directly do call_rcu().
+ */
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ }
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ bpf_mem_cache_raw_free(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ bool reuse_now)
+{
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+
+ if (!smap->bpf_ma) {
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+ } else {
+ /* Instead of using the vanilla call_rcu(),
+ * bpf_mem_cache_free will be able to reuse selem
+ * immediately.
+ */
+ migrate_disable();
+ bpf_mem_cache_free(&smap->selem_ma, selem);
+ migrate_enable();
+ }
+}
+
+/* local_storage->lock must be held and selem->local_storage == local_storage.
+ * The caller must ensure selem->smap is still valid to be
+ * dereferenced for its smap->elem_size and smap->cache_idx.
+ */
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem, bool reuse_now)
+{
+ struct bpf_local_storage_map *smap;
+ bool free_local_storage;
+ void *owner;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ owner = local_storage->owner;
+
+ /* All uncharging on the owner must be done first.
+ * The owner may be freed once the last selem is unlinked
+ * from local_storage.
+ */
+ if (uncharge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ free_local_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ if (free_local_storage) {
+ mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+
+ /* local_storage is not freed now. local_storage->lock is
+ * still held and raw_spin_unlock_bh(&local_storage->lock)
+ * will be done by the caller.
+ *
+ * Although the unlock will be done under
+ * rcu_read_lock(), it is more intuitive to
+ * read if the freeing of the storage is done
+ * after the raw_spin_unlock_bh(&local_storage->lock).
+ *
+ * Hence, a "bool free_local_storage" is returned
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
+ */
+ }
+ hlist_del_init_rcu(&selem->snode);
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ bpf_selem_free(selem, smap, reuse_now);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
+
+ return free_local_storage;
+}
+
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *storage_smap,
+ struct bpf_local_storage_elem *selem)
+{
+
+ struct bpf_local_storage_map *selem_smap;
+
+ /* local_storage->smap may be NULL. If it is, get the bpf_ma
+ * from any selem in the local_storage->list. The bpf_ma of all
+ * local_storage and selem should have the same value
+ * for the same map type.
+ *
+ * If the local_storage->list is already empty, the caller will not
+ * care about the bpf_ma value also because the caller is not
+ * responsibile to free the local_storage.
+ */
+
+ if (storage_smap)
+ return storage_smap->bpf_ma;
+
+ if (!selem) {
+ struct hlist_node *n;
+
+ n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+ bpf_rcu_lock_held());
+ if (!n)
+ return false;
+
+ selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+ }
+ selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ return selem_smap->bpf_ma;
+}
+
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
+{
+ struct bpf_local_storage_map *storage_smap;
+ struct bpf_local_storage *local_storage;
+ bool bpf_ma, free_local_storage = false;
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
+ /* selem has already been unlinked from sk */
+ return;
+
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+ storage_smap = rcu_dereference_check(local_storage->smap,
+ bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (likely(selem_linked_to_storage(selem)))
+ free_local_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, true, reuse_now);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ if (free_local_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
+}
+
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
+ /* selem has already be unlinked from smap */
+ return;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ b = select_bucket(smap, selem);
+ raw_spin_lock_irqsave(&b->lock, flags);
+ if (likely(selem_linked_to_map(selem)))
+ hlist_del_init_rcu(&selem->map_node);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
+{
+ /* Always unlink from map before unlinking from local_storage
+ * because selem will be freed after successfully unlinked from
+ * the local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ bpf_selem_unlink_storage(selem, reuse_now);
+}
+
+/* If cacheit_lockit is false, this lookup function is lockless */
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage_data *sdata;
+ struct bpf_local_storage_elem *selem;
+
+ /* Fast path (cache hit) */
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
+ if (sdata && rcu_access_pointer(sdata->smap) == smap)
+ return sdata;
+
+ /* Slow path (cache miss) */
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
+ if (rcu_access_pointer(SDATA(selem)->smap) == smap)
+ break;
+
+ if (!selem)
+ return NULL;
+
+ sdata = SDATA(selem);
+ if (cacheit_lockit) {
+ unsigned long flags;
+
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx],
+ sdata);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+
+ return sdata;
+}
+
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
+ u64 map_flags)
+{
+ if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
+{
+ struct bpf_local_storage *prev_storage, *storage;
+ struct bpf_local_storage **owner_storage_ptr;
+ int err;
+
+ err = mem_charge(smap, owner, sizeof(*storage));
+ if (err)
+ return err;
+
+ if (smap->bpf_ma) {
+ migrate_disable();
+ storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ migrate_enable();
+ } else {
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ }
+
+ if (!storage) {
+ err = -ENOMEM;
+ goto uncharge;
+ }
+
+ RCU_INIT_POINTER(storage->smap, smap);
+ INIT_HLIST_HEAD(&storage->list);
+ raw_spin_lock_init(&storage->lock);
+ storage->owner = owner;
+
+ bpf_selem_link_storage_nolock(storage, first_selem);
+ bpf_selem_link_map(smap, first_selem);
+
+ owner_storage_ptr =
+ (struct bpf_local_storage **)owner_storage(smap, owner);
+ /* Publish storage to the owner.
+ * Instead of using any lock of the kernel object (i.e. owner),
+ * cmpxchg will work with any kernel object regardless what
+ * the running context is, bh, irq...etc.
+ *
+ * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+ * is protected by the storage->lock. Hence, when freeing
+ * the owner->storage, the storage->lock must be held before
+ * setting owner->storage ptr to NULL.
+ */
+ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+ if (unlikely(prev_storage)) {
+ bpf_selem_unlink_map(first_selem);
+ err = -EAGAIN;
+ goto uncharge;
+
+ /* Note that even first_selem was linked to smap's
+ * bucket->list, first_selem can be freed immediately
+ * (instead of kfree_rcu) because
+ * bpf_local_storage_map_free() does a
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
+ * Hence, no one is accessing selem from the
+ * bucket->list under rcu_read_lock().
+ */
+ }
+
+ return 0;
+
+uncharge:
+ bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
+ mem_uncharge(smap, owner, sizeof(*storage));
+ return err;
+}
+
+/* sk cannot be going away because it is linking new elem
+ * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
+ * Otherwise, it will become a leak (and other memory issues
+ * during map destruction).
+ */
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags, gfp_t gfp_flags)
+{
+ struct bpf_local_storage_data *old_sdata = NULL;
+ struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
+ struct bpf_local_storage *local_storage;
+ unsigned long flags;
+ int err;
+
+ /* BPF_EXIST and BPF_NOEXIST cannot be both set */
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
+ /* BPF_F_LOCK can only be used in a value with spin_lock */
+ unlikely((map_flags & BPF_F_LOCK) &&
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
+ return ERR_PTR(-EINVAL);
+
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+ return ERR_PTR(-EINVAL);
+
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
+ if (!local_storage || hlist_empty(&local_storage->list)) {
+ /* Very first elem for the owner */
+ err = check_flags(NULL, map_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ if (err) {
+ bpf_selem_free(selem, smap, true);
+ mem_uncharge(smap, owner, smap->elem_size);
+ return ERR_PTR(err);
+ }
+
+ return SDATA(selem);
+ }
+
+ if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+ /* Hoping to find an old_sdata to do inline update
+ * such that it can avoid taking the local_storage->lock
+ * and changing the lists.
+ */
+ old_sdata =
+ bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ return ERR_PTR(err);
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
+ copy_map_value_locked(&smap->map, old_sdata->data,
+ value, false);
+ return old_sdata;
+ }
+ }
+
+ /* A lookup has just been done before and concluded a new selem is
+ * needed. The chance of an unnecessary alloc is unlikely.
+ */
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!alloc_selem)
+ return ERR_PTR(-ENOMEM);
+
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+
+ /* Recheck local_storage->list under local_storage->lock */
+ if (unlikely(hlist_empty(&local_storage->list))) {
+ /* A parallel del is happening and local_storage is going
+ * away. It has just been checked before, so very
+ * unlikely. Return instead of retry to keep things
+ * simple.
+ */
+ err = -EAGAIN;
+ goto unlock;
+ }
+
+ old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ goto unlock;
+
+ if (old_sdata && (map_flags & BPF_F_LOCK)) {
+ copy_map_value_locked(&smap->map, old_sdata->data, value,
+ false);
+ selem = SELEM(old_sdata);
+ goto unlock;
+ }
+
+ alloc_selem = NULL;
+ /* First, link the new selem to the map */
+ bpf_selem_link_map(smap, selem);
+
+ /* Second, link (and publish) the new selem to local_storage */
+ bpf_selem_link_storage_nolock(local_storage, selem);
+
+ /* Third, remove old selem, SELEM(old_sdata) */
+ if (old_sdata) {
+ bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
+ true, false);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ if (alloc_selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ bpf_selem_free(alloc_selem, smap, true);
+ }
+ return err ? ERR_PTR(err) : SDATA(selem);
+}
+
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+{
+ u64 min_usage = U64_MAX;
+ u16 i, res = 0;
+
+ spin_lock(&cache->idx_lock);
+
+ for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
+ if (cache->idx_usage_counts[i] < min_usage) {
+ min_usage = cache->idx_usage_counts[i];
+ res = i;
+
+ /* Found a free cache_idx */
+ if (!min_usage)
+ break;
+ }
+ }
+ cache->idx_usage_counts[res]++;
+
+ spin_unlock(&cache->idx_lock);
+
+ return res;
+}
+
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
+{
+ spin_lock(&cache->idx_lock);
+ cache->idx_usage_counts[idx]--;
+ spin_unlock(&cache->idx_lock);
+}
+
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ u32 int_data;
+
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ return 0;
+}
+
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+{
+ struct bpf_local_storage_map *storage_smap;
+ struct bpf_local_storage_elem *selem;
+ bool bpf_ma, free_storage = false;
+ struct hlist_node *n;
+ unsigned long flags;
+
+ storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, true, true);
+ }
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
+}
+
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache,
+ bool bpf_ma)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
+
+ smap->bpf_ma = bpf_ma;
+ if (bpf_ma) {
+ err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+ if (err)
+ goto free_smap;
+
+ err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+ if (err) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ goto free_smap;
+ }
+ }
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ this_cpu_inc(*busy_counter);
+ }
+ bpf_selem_unlink(selem, true);
+ if (busy_counter) {
+ this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ if (smap->bpf_ma) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ bpf_mem_alloc_destroy(&smap->storage_ma);
+ }
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 1b6b9349cb85..3dabdd137d10 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
- return node->ref;
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -110,7 +115,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
@@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, local_pending_list(loc_l));
}
@@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
@@ -502,13 +507,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
static void bpf_common_lru_push_free(struct bpf_lru *lru,
struct bpf_lru_node *node)
{
+ u8 node_type = READ_ONCE(node->type);
unsigned long flags;
- if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
- WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
return;
- if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
struct bpf_lru_locallist *loc_l;
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
@@ -521,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -567,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
@@ -593,7 +599,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 6b12f06ee18c..cbd8d3720c2b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -4,6 +4,7 @@
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
+#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
@@ -63,11 +64,8 @@ struct bpf_lru {
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
- /* ref is an approximation on access frequency. It does not
- * have to be very accurate. Hence, no protection is used.
- */
- if (!node->ref)
- node->ref = 1;
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
@@ -77,6 +75,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
void bpf_lru_destroy(struct bpf_lru *lru);
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
-void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fb278144e9fd..e8e910395bf6 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -7,10 +7,16 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
#include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/btf_ids.h>
+#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -24,7 +30,69 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
-#define BPF_LSM_SYM_PREFX "bpf_lsm_"
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+BTF_SET_START(bpf_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+BTF_SET_END(bpf_lsm_hooks)
+
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
@@ -35,8 +103,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return -EINVAL;
}
- if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name,
- sizeof(BPF_LSM_SYM_PREFX) - 1)) {
+ if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
prog->aux->attach_btf_id, prog->aux->attach_func_name);
return -EINVAL;
@@ -45,10 +112,277 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return 0;
}
+/* Mask for all the currently supported BPRM option flags */
+#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC
+
+BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
+{
+ if (flags & ~BPF_F_BRPM_OPTS_MASK)
+ return -EINVAL;
+
+ bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
+
+static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
+ .func = bpf_bprm_opts_set,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0],
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
+{
+ return ima_inode_hash(inode, dst, size);
+}
+
+static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
+{
+ return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
+
+static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
+ .func = bpf_ima_inode_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+ .func = bpf_get_attach_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static const struct bpf_func_proto *
+bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
+ switch (func_id) {
+ case BPF_FUNC_inode_storage_get:
+ return &bpf_inode_storage_get_proto;
+ case BPF_FUNC_inode_storage_delete:
+ return &bpf_inode_storage_delete_proto;
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif /* CONFIG_NET */
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_bprm_opts_set:
+ return &bpf_bprm_opts_set_proto;
+ case BPF_FUNC_ima_inode_hash:
+ return &bpf_ima_inode_hash_proto;
+ case BPF_FUNC_ima_file_hash:
+ return &bpf_ima_file_hash_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+/* The set of hooks which are called without pagefaults disabled and are allowed
+ * to "sleep" and thus can be used for sleepable BPF programs.
+ */
+BTF_SET_START(sleepable_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf)
+BTF_ID(func, bpf_lsm_bpf_map)
+BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bprm_check_security)
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+BTF_ID(func, bpf_lsm_bprm_committing_creds)
+BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
+BTF_ID(func, bpf_lsm_bprm_creds_from_file)
+BTF_ID(func, bpf_lsm_capget)
+BTF_ID(func, bpf_lsm_capset)
+BTF_ID(func, bpf_lsm_cred_prepare)
+BTF_ID(func, bpf_lsm_file_ioctl)
+BTF_ID(func, bpf_lsm_file_lock)
+BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_receive)
+
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif /* CONFIG_SECURITY_NETWORK */
+
+BTF_ID(func, bpf_lsm_inode_create)
+BTF_ID(func, bpf_lsm_inode_free_security)
+BTF_ID(func, bpf_lsm_inode_getattr)
+BTF_ID(func, bpf_lsm_inode_getxattr)
+BTF_ID(func, bpf_lsm_inode_mknod)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_rename)
+BTF_ID(func, bpf_lsm_inode_rmdir)
+BTF_ID(func, bpf_lsm_inode_setattr)
+BTF_ID(func, bpf_lsm_inode_setxattr)
+BTF_ID(func, bpf_lsm_inode_symlink)
+BTF_ID(func, bpf_lsm_inode_unlink)
+BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
+BTF_ID(func, bpf_lsm_kernfs_init_security)
+
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
+
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_free)
+#endif /* CONFIG_KEYS */
+
+BTF_ID(func, bpf_lsm_mmap_file)
+BTF_ID(func, bpf_lsm_netlink_send)
+BTF_ID(func, bpf_lsm_path_notify)
+BTF_ID(func, bpf_lsm_release_secctx)
+BTF_ID(func, bpf_lsm_sb_alloc_security)
+BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
+BTF_ID(func, bpf_lsm_sb_kern_mount)
+BTF_ID(func, bpf_lsm_sb_mount)
+BTF_ID(func, bpf_lsm_sb_remount)
+BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
+BTF_ID(func, bpf_lsm_sb_show_options)
+BTF_ID(func, bpf_lsm_sb_statfs)
+BTF_ID(func, bpf_lsm_sb_umount)
+BTF_ID(func, bpf_lsm_settime)
+
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_accept)
+BTF_ID(func, bpf_lsm_socket_bind)
+BTF_ID(func, bpf_lsm_socket_connect)
+BTF_ID(func, bpf_lsm_socket_create)
+BTF_ID(func, bpf_lsm_socket_getpeername)
+BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
+BTF_ID(func, bpf_lsm_socket_getsockname)
+BTF_ID(func, bpf_lsm_socket_getsockopt)
+BTF_ID(func, bpf_lsm_socket_listen)
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_recvmsg)
+BTF_ID(func, bpf_lsm_socket_sendmsg)
+BTF_ID(func, bpf_lsm_socket_shutdown)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif /* CONFIG_SECURITY_NETWORK */
+
+BTF_ID(func, bpf_lsm_syslog)
+BTF_ID(func, bpf_lsm_task_alloc)
+BTF_ID(func, bpf_lsm_current_getsecid_subj)
+BTF_ID(func, bpf_lsm_task_getsecid_obj)
+BTF_ID(func, bpf_lsm_task_prctl)
+BTF_ID(func, bpf_lsm_task_setscheduler)
+BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
+BTF_SET_END(sleepable_lsm_hooks)
+
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
+bool bpf_lsm_is_sleepable_hook(u32 btf_id)
+{
+ return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
+}
+
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
const struct bpf_verifier_ops lsm_verifier_ops = {
- .get_func_proto = tracing_prog_func_proto,
+ .get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c6b0decaa46a..02068bd0e4d9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -10,11 +10,14 @@
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -28,18 +31,19 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
+ struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
- /* progs has all the bpf_prog that is populated
+ /* link has all the bpf_links that is populated
* to the func ptr of the kernel's struct
* (in kvalue.data).
*/
- struct bpf_prog **progs;
+ struct bpf_link **links;
/* image is a page that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
- * "progs[]".
+ * "links[]".
*/
void *image;
/* uvalue->data stores the kernel struct
@@ -56,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -92,6 +103,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_struct_ops_test_run,
+#endif
};
static const struct btf_type *module_type;
@@ -161,7 +175,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -244,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -259,10 +274,17 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
/* No lock is needed. state and refcnt do not need
* to be updated together under atomic context.
*/
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -278,9 +300,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
u32 i;
for (i = 0; i < btf_type_vlen(t); i++) {
- if (st_map->progs[i]) {
- bpf_prog_put(st_map->progs[i]);
- st_map->progs[i] = NULL;
+ if (st_map->links[i]) {
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
}
@@ -292,14 +314,13 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -312,18 +333,57 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+ kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+ .release = bpf_struct_ops_link_release,
+ .dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_link *link,
+ const struct btf_func_model *model,
+ void *stub_func, void *image, void *image_end)
+{
+ u32 flags = BPF_TRAMP_F_INDIRECT;
+ int size;
+
+ tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+ tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+
+ if (model->ret_size > 0)
+ flags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+ size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ if (size < 0)
+ return size;
+ if (size > (unsigned long)image_end - (unsigned long)image)
+ return -E2BIG;
+ return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ model, flags, tlinks, stub_func);
+}
+
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
- struct bpf_tramp_progs *tprogs = NULL;
+ struct bpf_tramp_links *tlinks;
void *udata, *kdata;
- int prog_fd, err = 0;
- void *image;
+ int prog_fd, err;
+ void *image, *image_end;
u32 i;
if (flags)
@@ -336,7 +396,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
return err;
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
err = check_zero_holes(t, uvalue->data);
if (err)
return err;
@@ -344,8 +404,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (uvalue->state || refcount_read(&uvalue->refcnt))
return -EINVAL;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -363,13 +423,15 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
+ image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
+ struct bpf_tramp_link *link;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -396,8 +458,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
u32 msize;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
@@ -421,38 +482,65 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = PTR_ERR(prog);
goto reset_unlock;
}
- st_map->progs[i] = prog;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
prog->aux->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != i) {
+ bpf_prog_put(prog);
err = -EINVAL;
goto reset_unlock;
}
- tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
- tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(image,
- st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ bpf_prog_put(prog);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog);
+ st_map->links[i] = &link->link;
+
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ image, image_end);
if (err < 0)
goto reset_unlock;
- *(void **)(kdata + moff) = image;
+ *(void **)(kdata + moff) = image + cfi_get_offset();
image += err;
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = 0;
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -461,40 +549,39 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- /* Error during st_ops->reg(). It is very unlikely since
- * the above init_member() should have caught it earlier
- * before reg(). The only possibility is if there was a race
- * in registering the struct_ops (under the same name) to
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- set_memory_nx((long)st_map->image, 1);
- set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
+ arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
- kfree(tprogs);
+ kfree(tlinks);
mutex_unlock(&st_map->lock);
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -527,22 +614,47 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- if (st_map->progs)
+ if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
- bpf_map_area_free(st_map->progs);
- bpf_jit_free_exec(st_map->image);
+ bpf_map_area_free(st_map->links);
+ if (st_map->image) {
+ arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -550,15 +662,11 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops *st_ops;
- size_t map_total_size, st_map_size;
+ size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
- struct bpf_map_memory mem;
struct bpf_map *map;
- int err;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
+ int ret;
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
@@ -575,42 +683,61 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
* struct bpf_struct_ops_tcp_congestions_ops
*/
(vt->size - sizeof(struct bpf_struct_ops_value));
- map_total_size = st_map_size +
- /* uvalue */
- sizeof(vt->size) +
- /* struct bpf_progs **progs */
- btf_type_vlen(t) * sizeof(struct bpf_prog *);
- err = bpf_map_charge_init(&mem, map_total_size);
- if (err < 0)
- return ERR_PTR(err);
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
- if (!st_map) {
- bpf_map_charge_finish(&mem);
+ if (!st_map)
return ERR_PTR(-ENOMEM);
- }
+
st_map->st_ops = st_ops;
map = &st_map->map;
+ ret = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (ret) {
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(ret);
+ }
+
+ st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!st_map->image) {
+ /* __bpf_struct_ops_map_free() uses st_map->image as flag
+ * for "charged or not". In this case, we need to unchange
+ * here.
+ */
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(-ENOMEM);
+ }
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->progs =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ st_map->links =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
NUMA_NO_NODE);
- st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->progs || !st_map->image) {
- bpf_struct_ops_map_free(map);
- bpf_map_charge_finish(&mem);
+ if (!st_map->uvalue || !st_map->links) {
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
mutex_init(&st_map->lock);
- set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
- bpf_map_charge_move(&map->memory, &mem);
return map;
}
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct btf_type *vt = st_ops->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -620,6 +747,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
+ .map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
/* "const void *" because some subsystem is
@@ -628,22 +757,177 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ bpf_map_put(&st_map->map);
+}
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
+ */
+ st_map->st_ops->unreg(&st_map->kvalue.data);
bpf_map_put(&st_map->map);
}
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ if (!st_map->st_ops->update)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
index 066d83ea1c99..5678a9ddf817 100644
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -2,6 +2,9 @@
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_NET
+BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
+#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
new file mode 100644
index 000000000000..adf6dfe0ba68
--- /dev/null
+++ b/kernel/bpf/bpf_task_storage.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(task_cache);
+
+static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+ this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
+{
+ struct task_struct *task = owner;
+
+ return &task->bpf_storage;
+}
+
+static struct bpf_local_storage_data *
+task_storage_lookup(struct task_struct *task, struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *task_storage;
+ struct bpf_local_storage_map *smap;
+
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
+ if (!task_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit);
+}
+
+void bpf_task_storage_free(struct task_struct *task)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock();
+
+ local_storage = rcu_dereference(task->bpf_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_task_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_task_storage_unlock();
+ rcu_read_unlock();
+}
+
+static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = task_storage_lookup(task, map, true);
+ bpf_task_storage_unlock();
+ put_pid(pid);
+ return sdata ? sdata->data : NULL;
+out:
+ put_pid(pid);
+ return ERR_PTR(err);
+}
+
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ GFP_ATOMIC);
+ bpf_task_storage_unlock();
+
+ err = PTR_ERR_OR_ZERO(sdata);
+out:
+ put_pid(pid);
+ return err;
+}
+
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = task_storage_lookup(task, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ if (!nobusy)
+ return -EBUSY;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ err = task_storage_delete(task, map, true);
+ bpf_task_storage_unlock();
+out:
+ put_pid(pid);
+ return err;
+}
+
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = task_storage_lookup(task, map, nobusy);
+ if (sdata)
+ return sdata->data;
+
+ /* only allocate new storage, when the task is refcounted */
+ if (refcount_read(&task->usage) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ bool nobusy;
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ bpf_task_storage_lock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, true);
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
+}
+
+static void task_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
+}
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
+const struct bpf_map_ops task_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = task_storage_map_alloc,
+ .map_free = task_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_pid_task_storage_lookup_elem,
+ .map_update_elem = bpf_pid_task_storage_update_elem,
+ .map_delete_elem = bpf_pid_task_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = task_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_task_storage_get_proto = {
+ .func = bpf_task_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
+const struct bpf_func_proto bpf_task_storage_delete_proto = {
+ .func = bpf_task_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0443600146dc..596471189176 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -18,9 +18,19 @@
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
#include <net/sock.h>
+#include <net/xdp.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -47,7 +57,7 @@
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
@@ -169,7 +179,7 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x8f00ffff
+#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -185,11 +195,6 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-#define for_each_vsi(i, struct_type, member) \
- for (i = 0, member = btf_type_var_secinfo(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_vsi_from(i, from, struct_type, member) \
for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -198,6 +203,44 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_TRACING,
+ BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 256,
+ BTF_DTOR_KFUNC_MAX_CNT = 256,
+ BTF_KFUNC_FILTER_MAX_CNT = 16,
+};
+
+struct btf_kfunc_hook_filter {
+ btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
+ u32 nr_filters;
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+ struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
+};
+
+struct btf_id_dtor_kfunc_tab {
+ u32 cnt;
+ struct btf_id_dtor_kfunc dtors[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -206,12 +249,22 @@ struct btf {
const char *strings;
void *nohdr_data;
struct btf_header hdr;
- u32 nr_types;
+ u32 nr_types; /* includes VOID for base BTF */
u32 types_size;
u32 data_size;
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
+ struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
+
+ /* split BTF support */
+ struct btf *base_btf;
+ u32 start_id; /* first type ID in this BTF (0 for base BTF) */
+ u32 start_str_off; /* first string offset (0 for base BTF) */
+ char name[MODULE_NAME_LEN];
+ bool kernel_btf;
};
enum verifier_phase {
@@ -274,13 +327,108 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
+ [BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
};
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
+/* Chunk size we use in safe copy of data to be shown. */
+#define BTF_SHOW_OBJ_SAFE_SIZE 32
+
+/*
+ * This is the maximum size of a base type value (equivalent to a
+ * 128-bit int); if we are at the end of our safe buffer and have
+ * less than 16 bytes space we can't be assured of being able
+ * to copy the next type safely, so in such cases we will initiate
+ * a new copy.
+ */
+#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16
+
+/* Type name size */
+#define BTF_SHOW_NAME_SIZE 80
+
+/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
+ * Common data to all BTF show operations. Private show functions can add
+ * their own data to a structure containing a struct btf_show and consult it
+ * in the show callback. See btf_type_show() below.
+ *
+ * One challenge with showing nested data is we want to skip 0-valued
+ * data, but in order to figure out whether a nested object is all zeros
+ * we need to walk through it. As a result, we need to make two passes
+ * when handling structs, unions and arrays; the first path simply looks
+ * for nonzero data, while the second actually does the display. The first
+ * pass is signalled by show->state.depth_check being set, and if we
+ * encounter a non-zero value we set show->state.depth_to_show to
+ * the depth at which we encountered it. When we have completed the
+ * first pass, we will know if anything needs to be displayed if
+ * depth_to_show > depth. See btf_[struct,array]_show() for the
+ * implementation of this.
+ *
+ * Another problem is we want to ensure the data for display is safe to
+ * access. To support this, the anonymous "struct {} obj" tracks the data
+ * object and our safe copy of it. We copy portions of the data needed
+ * to the object "copy" buffer, but because its size is limited to
+ * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we
+ * traverse larger objects for display.
+ *
+ * The various data type show functions all start with a call to
+ * btf_show_start_type() which returns a pointer to the safe copy
+ * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the
+ * raw data itself). btf_show_obj_safe() is responsible for
+ * using copy_from_kernel_nofault() to update the safe data if necessary
+ * as we traverse the object's data. skbuff-like semantics are
+ * used:
+ *
+ * - obj.head points to the start of the toplevel object for display
+ * - obj.size is the size of the toplevel object
+ * - obj.data points to the current point in the original data at
+ * which our safe data starts. obj.data will advance as we copy
+ * portions of the data.
+ *
+ * In most cases a single copy will suffice, but larger data structures
+ * such as "struct task_struct" will require many copies. The logic in
+ * btf_show_obj_safe() handles the logic that determines if a new
+ * copy_from_kernel_nofault() is needed.
+ */
+struct btf_show {
+ u64 flags;
+ void *target; /* target of show operation (seq file, buffer) */
+ void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ const struct btf *btf;
+ /* below are used during iteration */
+ struct {
+ u8 depth;
+ u8 depth_to_show;
+ u8 depth_check;
+ u8 array_member:1,
+ array_terminated:1;
+ u16 array_encoding;
+ u32 type_id;
+ int status; /* non-zero for error */
+ const struct btf_type *type;
+ const struct btf_member *member;
+ char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */
+ } state;
+ struct {
+ u32 size;
+ void *head;
+ void *data;
+ u8 safe[BTF_SHOW_OBJ_SAFE_SIZE];
+ } obj;
+};
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -297,9 +445,9 @@ struct btf_kind_operations {
const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
- void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ void (*show)(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m);
+ struct btf_show *show);
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -308,6 +456,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -325,6 +476,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -341,55 +493,55 @@ static bool btf_type_is_fwd(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
}
-static bool btf_type_nosize(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return btf_type_is_void(t) || btf_type_is_fwd(t) ||
- btf_type_is_func(t) || btf_type_is_func_proto(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_nosize_or_null(const struct btf_type *t)
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return !t || btf_type_nosize(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
-/* union is only a special case of struct:
- * all its offsetof(member) == 0
- */
-static bool btf_type_is_struct(const struct btf_type *t)
+static bool btf_type_nosize(const struct btf_type *t)
{
- u8 kind = BTF_INFO_KIND(t->info);
-
- return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+ btf_type_is_decl_tag(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+ return !t || btf_type_nosize(t);
}
-static bool btf_type_is_array(const struct btf_type *t)
+static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+ return btf_type_is_func(t) || btf_type_is_struct(t) ||
+ btf_type_is_var(t) || btf_type_is_typedef(t);
}
-static bool btf_type_is_var(const struct btf_type *t)
+u32 btf_nr_types(const struct btf *btf)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
-}
+ u32 total = 0;
-static bool btf_type_is_datasec(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ while (btf) {
+ total += btf->nr_types;
+ btf = btf->base_btf;
+ }
+
+ return total;
}
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
const struct btf_type *t;
const char *tname;
- u32 i;
+ u32 i, total;
- for (i = 1; i <= btf->nr_types; i++) {
- t = btf->types[i];
+ total = btf_nr_types(btf);
+ for (i = 1; i < total; i++) {
+ t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != kind)
continue;
@@ -401,6 +553,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -447,6 +643,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -473,6 +670,8 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -485,6 +684,8 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
return true;
}
@@ -525,9 +726,14 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
-static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
+static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
{
- return (const struct btf_var_secinfo *)(t + 1);
+ return (const struct btf_decl_tag *)(t + 1);
+}
+
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
}
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
@@ -537,35 +743,52 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
{
- return BTF_STR_OFFSET_VALID(offset) &&
- offset < btf->hdr.str_len;
+ if (!BTF_STR_OFFSET_VALID(offset))
+ return false;
+
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ return offset < btf->hdr.str_len;
}
-static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+static bool __btf_name_char_ok(char c, bool first)
{
if ((first ? !isalpha(c) :
!isalnum(c)) &&
c != '_' &&
- ((c == '.' && !dot_ok) ||
- c != '.'))
+ c != '.')
return false;
return true;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
+static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+{
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+
+ return NULL;
+}
+
+static bool __btf_name_valid(const struct btf *btf, u32 offset)
{
/* offset must be valid */
- const char *src = &btf->strings[offset];
+ const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
- if (!__btf_name_char_ok(*src, true, dot_ok))
+ if (!__btf_name_char_ok(*src, true))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!__btf_name_char_ok(*src, false, dot_ok))
+ if (!__btf_name_char_ok(*src, false))
return false;
src++;
}
@@ -573,44 +796,43 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
return !*src;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, false);
+ return __btf_name_valid(btf, offset);
}
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, true);
+ return __btf_name_valid(btf, offset);
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
+ const char *name;
+
if (!offset)
return "(anon)";
- else if (offset < btf->hdr.str_len)
- return &btf->strings[offset];
- else
- return "(invalid-name-offset)";
+
+ name = btf_str_by_offset(btf, offset);
+ return name ?: "(invalid-name-offset)";
}
const char *btf_name_by_offset(const struct btf *btf, u32 offset)
{
- if (offset < btf->hdr.str_len)
- return &btf->strings[offset];
-
- return NULL;
+ return btf_str_by_offset(btf, offset);
}
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
- if (type_id > btf->nr_types)
- return NULL;
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+ type_id -= btf->start_id;
+ if (type_id >= btf->nr_types)
+ return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
* Regular int is not a bit field and it must be either
@@ -676,6 +898,489 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
return true;
}
+/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */
+static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
+ u32 id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t) &&
+ BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ return t;
+}
+
+#define BTF_SHOW_MAX_ITER 10
+
+#define BTF_KIND_BIT(kind) (1ULL << kind)
+
+/*
+ * Populate show->state.name with type name information.
+ * Format of type name is
+ *
+ * [.member_name = ] (type_name)
+ */
+static const char *btf_show_name(struct btf_show *show)
+{
+ /* BTF_MAX_ITER array suffixes "[]" */
+ const char *array_suffixes = "[][][][][][][][][][]";
+ const char *array_suffix = &array_suffixes[strlen(array_suffixes)];
+ /* BTF_MAX_ITER pointer suffixes "*" */
+ const char *ptr_suffixes = "**********";
+ const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
+ const char *name = NULL, *prefix = "", *parens = "";
+ const struct btf_member *m = show->state.member;
+ const struct btf_type *t;
+ const struct btf_array *array;
+ u32 id = show->state.type_id;
+ const char *member = NULL;
+ bool show_member = false;
+ u64 kinds = 0;
+ int i;
+
+ show->state.name[0] = '\0';
+
+ /*
+ * Don't show type name if we're showing an array member;
+ * in that case we show the array type so don't need to repeat
+ * ourselves for each member.
+ */
+ if (show->state.array_member)
+ return "";
+
+ /* Retrieve member name, if any. */
+ if (m) {
+ member = btf_name_by_offset(show->btf, m->name_off);
+ show_member = strlen(member) > 0;
+ id = m->type;
+ }
+
+ /*
+ * Start with type_id, as we have resolved the struct btf_type *
+ * via btf_modifier_show() past the parent typedef to the child
+ * struct, int etc it is defined as. In such cases, the type_id
+ * still represents the starting type while the struct btf_type *
+ * in our show->state points at the resolved type of the typedef.
+ */
+ t = btf_type_by_id(show->btf, id);
+ if (!t)
+ return "";
+
+ /*
+ * The goal here is to build up the right number of pointer and
+ * array suffixes while ensuring the type name for a typedef
+ * is represented. Along the way we accumulate a list of
+ * BTF kinds we have encountered, since these will inform later
+ * display; for example, pointer types will not require an
+ * opening "{" for struct, we will just display the pointer value.
+ *
+ * We also want to accumulate the right number of pointer or array
+ * indices in the format string while iterating until we get to
+ * the typedef/pointee/array member target type.
+ *
+ * We start by pointing at the end of pointer and array suffix
+ * strings; as we accumulate pointers and arrays we move the pointer
+ * or array string backwards so it will show the expected number of
+ * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers
+ * and/or arrays and typedefs are supported as a precaution.
+ *
+ * We also want to get typedef name while proceeding to resolve
+ * type it points to so that we can add parentheses if it is a
+ * "typedef struct" etc.
+ */
+ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) {
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ if (!name)
+ name = btf_name_by_offset(show->btf,
+ t->name_off);
+ kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF);
+ id = t->type;
+ break;
+ case BTF_KIND_ARRAY:
+ kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY);
+ parens = "[";
+ if (!t)
+ return "";
+ array = btf_type_array(t);
+ if (array_suffix > array_suffixes)
+ array_suffix -= 2;
+ id = array->type;
+ break;
+ case BTF_KIND_PTR:
+ kinds |= BTF_KIND_BIT(BTF_KIND_PTR);
+ if (ptr_suffix > ptr_suffixes)
+ ptr_suffix -= 1;
+ id = t->type;
+ break;
+ default:
+ id = 0;
+ break;
+ }
+ if (!id)
+ break;
+ t = btf_type_skip_qualifiers(show->btf, id);
+ }
+ /* We may not be able to represent this type; bail to be safe */
+ if (i == BTF_SHOW_MAX_ITER)
+ return "";
+
+ if (!name)
+ name = btf_name_by_offset(show->btf, t->name_off);
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ?
+ "struct" : "union";
+ /* if it's an array of struct/union, parens is already set */
+ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY))))
+ parens = "{";
+ break;
+ case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
+ prefix = "enum";
+ break;
+ default:
+ break;
+ }
+
+ /* pointer does not require parens */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR))
+ parens = "";
+ /* typedef does not require struct/union/enum prefix */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF))
+ prefix = "";
+
+ if (!name)
+ name = "";
+
+ /* Even if we don't want type name info, we want parentheses etc */
+ if (show->flags & BTF_SHOW_NONAME)
+ snprintf(show->state.name, sizeof(show->state.name), "%s",
+ parens);
+ else
+ snprintf(show->state.name, sizeof(show->state.name),
+ "%s%s%s(%s%s%s%s%s%s)%s",
+ /* first 3 strings comprise ".member = " */
+ show_member ? "." : "",
+ show_member ? member : "",
+ show_member ? " = " : "",
+ /* ...next is our prefix (struct, enum, etc) */
+ prefix,
+ strlen(prefix) > 0 && strlen(name) > 0 ? " " : "",
+ /* ...this is the type name itself */
+ name,
+ /* ...suffixed by the appropriate '*', '[]' suffixes */
+ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix,
+ array_suffix, parens);
+
+ return show->state.name;
+}
+
+static const char *__btf_show_indent(struct btf_show *show)
+{
+ const char *indents = " ";
+ const char *indent = &indents[strlen(indents)];
+
+ if ((indent - show->state.depth) >= indents)
+ return indent - show->state.depth;
+ return indents;
+}
+
+static const char *btf_show_indent(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show);
+}
+
+static const char *btf_show_newline(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : "\n";
+}
+
+static const char *btf_show_delim(struct btf_show *show)
+{
+ if (show->state.depth == 0)
+ return "";
+
+ if ((show->flags & BTF_SHOW_COMPACT) && show->state.type &&
+ BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION)
+ return "|";
+
+ return ",";
+}
+
+__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!show->state.depth_check) {
+ va_start(args, fmt);
+ show->showfn(show, fmt, args);
+ va_end(args);
+ }
+}
+
+/* Macros are used here as btf_show_type_value[s]() prepends and appends
+ * format specifiers to the format specifier passed in; these do the work of
+ * adding indentation, delimiters etc while the caller simply has to specify
+ * the type value(s) in the format specifier + value(s).
+ */
+#define btf_show_type_value(show, fmt, value) \
+ do { \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
+ show->state.depth == 0) { \
+ btf_show(show, "%s%s" fmt "%s%s", \
+ btf_show_indent(show), \
+ btf_show_name(show), \
+ value, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } \
+ } while (0)
+
+#define btf_show_type_values(show, fmt, ...) \
+ do { \
+ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \
+ btf_show_name(show), \
+ __VA_ARGS__, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } while (0)
+
+/* How much is left to copy to safe buffer after @data? */
+static int btf_show_obj_size_left(struct btf_show *show, void *data)
+{
+ return show->obj.head + show->obj.size - data;
+}
+
+/* Is object pointed to by @data of @size already copied to our safe buffer? */
+static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size)
+{
+ return data >= show->obj.data &&
+ (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE);
+}
+
+/*
+ * If object pointed to by @data of @size falls within our safe buffer, return
+ * the equivalent pointer to the same safe data. Assumes
+ * copy_from_kernel_nofault() has already happened and our safe buffer is
+ * populated.
+ */
+static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size)
+{
+ if (btf_show_obj_is_safe(show, data, size))
+ return show->obj.safe + (data - show->obj.data);
+ return NULL;
+}
+
+/*
+ * Return a safe-to-access version of data pointed to by @data.
+ * We do this by copying the relevant amount of information
+ * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault().
+ *
+ * If BTF_SHOW_UNSAFE is specified, just return data as-is; no
+ * safe copy is needed.
+ *
+ * Otherwise we need to determine if we have the required amount
+ * of data (determined by the @data pointer and the size of the
+ * largest base type we can encounter (represented by
+ * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures
+ * that we will be able to print some of the current object,
+ * and if more is needed a copy will be triggered.
+ * Some objects such as structs will not fit into the buffer;
+ * in such cases additional copies when we iterate over their
+ * members may be needed.
+ *
+ * btf_show_obj_safe() is used to return a safe buffer for
+ * btf_show_start_type(); this ensures that as we recurse into
+ * nested types we always have safe data for the given type.
+ * This approach is somewhat wasteful; it's possible for example
+ * that when iterating over a large union we'll end up copying the
+ * same data repeatedly, but the goal is safety not performance.
+ * We use stack data as opposed to per-CPU buffers because the
+ * iteration over a type can take some time, and preemption handling
+ * would greatly complicate use of the safe buffer.
+ */
+static void *btf_show_obj_safe(struct btf_show *show,
+ const struct btf_type *t,
+ void *data)
+{
+ const struct btf_type *rt;
+ int size_left, size;
+ void *safe = NULL;
+
+ if (show->flags & BTF_SHOW_UNSAFE)
+ return data;
+
+ rt = btf_resolve_size(show->btf, t, &size);
+ if (IS_ERR(rt)) {
+ show->state.status = PTR_ERR(rt);
+ return NULL;
+ }
+
+ /*
+ * Is this toplevel object? If so, set total object size and
+ * initialize pointers. Otherwise check if we still fall within
+ * our safe object data.
+ */
+ if (show->state.depth == 0) {
+ show->obj.size = size;
+ show->obj.head = data;
+ } else {
+ /*
+ * If the size of the current object is > our remaining
+ * safe buffer we _may_ need to do a new copy. However
+ * consider the case of a nested struct; it's size pushes
+ * us over the safe buffer limit, but showing any individual
+ * struct members does not. In such cases, we don't need
+ * to initiate a fresh copy yet; however we definitely need
+ * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left
+ * in our buffer, regardless of the current object size.
+ * The logic here is that as we resolve types we will
+ * hit a base type at some point, and we need to be sure
+ * the next chunk of data is safely available to display
+ * that type info safely. We cannot rely on the size of
+ * the current object here because it may be much larger
+ * than our current buffer (e.g. task_struct is 8k).
+ * All we want to do here is ensure that we can print the
+ * next basic type, which we can if either
+ * - the current type size is within the safe buffer; or
+ * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in
+ * the safe buffer.
+ */
+ safe = __btf_show_obj_safe(show, data,
+ min(size,
+ BTF_SHOW_OBJ_BASE_TYPE_SIZE));
+ }
+
+ /*
+ * We need a new copy to our safe object, either because we haven't
+ * yet copied and are initializing safe data, or because the data
+ * we want falls outside the boundaries of the safe object.
+ */
+ if (!safe) {
+ size_left = btf_show_obj_size_left(show, data);
+ if (size_left > BTF_SHOW_OBJ_SAFE_SIZE)
+ size_left = BTF_SHOW_OBJ_SAFE_SIZE;
+ show->state.status = copy_from_kernel_nofault(show->obj.safe,
+ data, size_left);
+ if (!show->state.status) {
+ show->obj.data = data;
+ safe = show->obj.safe;
+ }
+ }
+
+ return safe;
+}
+
+/*
+ * Set the type we are starting to show and return a safe data pointer
+ * to be used for showing the associated data.
+ */
+static void *btf_show_start_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ show->state.type = t;
+ show->state.type_id = type_id;
+ show->state.name[0] = '\0';
+
+ return btf_show_obj_safe(show, t, data);
+}
+
+static void btf_show_end_type(struct btf_show *show)
+{
+ show->state.type = NULL;
+ show->state.type_id = 0;
+ show->state.name[0] = '\0';
+}
+
+static void *btf_show_start_aggr_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ void *safe_data = btf_show_start_type(show, t, type_id, data);
+
+ if (!safe_data)
+ return safe_data;
+
+ btf_show(show, "%s%s%s", btf_show_indent(show),
+ btf_show_name(show),
+ btf_show_newline(show));
+ show->state.depth++;
+ return safe_data;
+}
+
+static void btf_show_end_aggr_type(struct btf_show *show,
+ const char *suffix)
+{
+ show->state.depth--;
+ btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix,
+ btf_show_delim(show), btf_show_newline(show));
+ btf_show_end_type(show);
+}
+
+static void btf_show_start_member(struct btf_show *show,
+ const struct btf_member *m)
+{
+ show->state.member = m;
+}
+
+static void btf_show_start_array_member(struct btf_show *show)
+{
+ show->state.array_member = 1;
+ btf_show_start_member(show, NULL);
+}
+
+static void btf_show_end_member(struct btf_show *show)
+{
+ show->state.member = NULL;
+}
+
+static void btf_show_end_array_member(struct btf_show *show)
+{
+ show->state.array_member = 0;
+ btf_show_end_member(show);
+}
+
+static void *btf_show_start_array_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ u16 array_encoding,
+ void *data)
+{
+ show->state.array_encoding = array_encoding;
+ show->state.array_terminated = 0;
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_array_type(struct btf_show *show)
+{
+ show->state.array_encoding = 0;
+ show->state.array_terminated = 0;
+ btf_show_end_aggr_type(show, "]");
+}
+
+static void *btf_show_start_struct_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ void *data)
+{
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_struct_type(struct btf_show *show)
+{
+ btf_show_end_aggr_type(show, "}");
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -706,23 +1411,28 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -757,8 +1467,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -846,17 +1563,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
{
struct btf *btf = env->btf;
- /* < 2 because +1 for btf_void which is always in btf->types[0].
- * btf_void is not accounted in btf->nr_types because btf_void
- * does not come from the BTF file.
- */
- if (btf->types_size - btf->nr_types < 2) {
+ if (btf->types_size == btf->nr_types) {
/* Expand 'types' array */
struct btf_type **new_types;
u32 expand_by, new_size;
- if (btf->types_size == BTF_MAX_TYPE) {
+ if (btf->start_id + btf->types_size == BTF_MAX_TYPE) {
btf_verifier_log(env, "Exceeded max num of types");
return -E2BIG;
}
@@ -870,18 +1583,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
if (!new_types)
return -ENOMEM;
- if (btf->nr_types == 0)
- new_types[0] = &btf_void;
- else
+ if (btf->nr_types == 0) {
+ if (!btf->base_btf) {
+ /* lazily init VOID type */
+ new_types[0] = &btf_void;
+ btf->nr_types++;
+ }
+ } else {
memcpy(new_types, btf->types,
- sizeof(*btf->types) * (btf->nr_types + 1));
+ sizeof(*btf->types) * btf->nr_types);
+ }
kvfree(btf->types);
btf->types = new_types;
btf->types_size = new_size;
}
- btf->types[++(btf->nr_types)] = t;
+ btf->types[btf->nr_types++] = t;
return 0;
}
@@ -922,8 +1640,59 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook;
+
+ if (!tab)
+ return;
+ /* For module BTF, we directly assign the sets being registered, so
+ * there is nothing to free except kfunc_set_tab.
+ */
+ if (btf_is_module(btf))
+ goto free_tab;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
+free_tab:
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+ if (!tab)
+ return;
+ kfree(tab);
+ btf->dtor_kfunc_tab = NULL;
+}
+
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++)
+ btf_record_free(tab->types[i].record);
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
+ btf_free_dtor_kfunc_tab(btf);
+ btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -938,6 +1707,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
+void btf_get(struct btf *btf)
+{
+ refcount_inc(&btf->refcnt);
+}
+
void btf_put(struct btf *btf)
{
if (btf && refcount_dec_and_test(&btf->refcnt)) {
@@ -954,18 +1728,17 @@ static int env_resolve_init(struct btf_verifier_env *env)
u32 *resolved_ids = NULL;
u8 *visit_states = NULL;
- /* +1 for btf_void */
- resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes),
+ resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_sizes)
goto nomem;
- resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids),
+ resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_ids)
goto nomem;
- visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states),
+ visit_states = kvcalloc(nr_types, sizeof(*visit_states),
GFP_KERNEL | __GFP_NOWARN);
if (!visit_states)
goto nomem;
@@ -1017,21 +1790,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
static bool env_type_is_resolved(const struct btf_verifier_env *env,
u32 type_id)
{
- return env->visit_states[type_id] == RESOLVED;
+ /* base BTF types should be resolved by now */
+ if (type_id < env->btf->start_id)
+ return true;
+
+ return env->visit_states[type_id - env->btf->start_id] == RESOLVED;
}
static int env_stack_push(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id)
{
+ const struct btf *btf = env->btf;
struct resolve_vertex *v;
if (env->top_stack == MAX_RESOLVE_DEPTH)
return -E2BIG;
- if (env->visit_states[type_id] != NOT_VISITED)
+ if (type_id < btf->start_id
+ || env->visit_states[type_id - btf->start_id] != NOT_VISITED)
return -EEXIST;
- env->visit_states[type_id] = VISITED;
+ env->visit_states[type_id - btf->start_id] = VISITED;
v = &env->stack[env->top_stack++];
v->t = t;
@@ -1061,6 +1840,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env,
u32 type_id = env->stack[--(env->top_stack)].type_id;
struct btf *btf = env->btf;
+ type_id -= btf->start_id; /* adjust to local type id */
btf->resolved_sizes[type_id] = resolved_size;
btf->resolved_ids[type_id] = resolved_type_id;
env->visit_states[type_id] = RESOLVED;
@@ -1078,23 +1858,27 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *type_size: (x * y * sizeof(u32)). Hence, *type_size always
* corresponds to the return type.
* *elem_type: u32
+ * *elem_id: id of u32
* *total_nelems: (x * y). Hence, individual elem size is
* (*type_size / *total_nelems)
+ * *type_id: id of type if it's changed within the function, 0 if not
*
* type: is not an array (e.g. const struct X)
* return type: type "struct X"
* *type_size: sizeof(struct X)
* *elem_type: same as return type ("struct X")
+ * *elem_id: 0
* *total_nelems: 1
+ * *type_id: id of type if it's changed within the function, 0 if not
*/
-const struct btf_type *
-btf_resolve_size(const struct btf *btf, const struct btf_type *type,
- u32 *type_size, const struct btf_type **elem_type,
- u32 *total_nelems)
+static const struct btf_type *
+__btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size, const struct btf_type **elem_type,
+ u32 *elem_id, u32 *total_nelems, u32 *type_id)
{
const struct btf_type *array_type = NULL;
- const struct btf_array *array;
- u32 i, size, nelems = 1;
+ const struct btf_array *array = NULL;
+ u32 i, size, nelems = 1, id = 0;
for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
switch (BTF_INFO_KIND(type->info)) {
@@ -1103,6 +1887,8 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
size = type->size;
goto resolved;
@@ -1115,6 +1901,8 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
+ id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -1145,18 +1933,45 @@ resolved:
*total_nelems = nelems;
if (elem_type)
*elem_type = type;
+ if (elem_id)
+ *elem_id = array ? array->type : 0;
+ if (type_id && id)
+ *type_id = id;
return array_type ? : type;
}
+const struct btf_type *
+btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size)
+{
+ return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL);
+}
+
+static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_ids[type_id - btf->start_id];
+}
+
/* The input param "type_id" must point to a needs_resolve type */
static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
u32 *type_id)
{
- *type_id = btf->resolved_ids[*type_id];
+ *type_id = btf_resolved_type_id(btf, *type_id);
return btf_type_by_id(btf, *type_id);
}
+static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_sizes[type_id - btf->start_id];
+}
+
const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 *type_id, u32 *ret_size)
{
@@ -1171,7 +1986,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
if (btf_type_has_size(size_type)) {
size = size_type->size;
} else if (btf_type_is_array(size_type)) {
- size = btf->resolved_sizes[size_type_id];
+ size = btf_resolved_type_size(btf, size_type_id);
} else if (btf_type_is_ptr(size_type)) {
size = sizeof(void *);
} else {
@@ -1179,14 +1994,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
!btf_type_is_var(size_type)))
return NULL;
- size_type_id = btf->resolved_ids[size_type_id];
+ size_type_id = btf_resolved_type_id(btf, size_type_id);
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
else if (btf_type_has_size(size_type))
size = size_type->size;
else if (btf_type_is_array(size_type))
- size = btf->resolved_sizes[size_type_id];
+ size = btf_resolved_type_size(btf, size_type_id);
else if (btf_type_is_ptr(size_type))
size = sizeof(void *);
else
@@ -1220,7 +2035,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env,
return -EINVAL;
}
-/* Used for ptr, array and struct/union type members.
+/* Used for ptr, array struct/union and float type members.
* int, enum and modifier types have their specific callback functions.
*/
static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
@@ -1249,11 +2064,11 @@ static int btf_df_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
-static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m)
+static void btf_df_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct btf_show *show)
{
- seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+ btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
}
static int btf_int_check_member(struct btf_verifier_env *env,
@@ -1426,7 +2241,7 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
-static void btf_int128_print(struct seq_file *m, void *data)
+static void btf_int128_print(struct btf_show *show, void *data)
{
/* data points to a __int128 number.
* Suppose
@@ -1445,9 +2260,10 @@ static void btf_int128_print(struct seq_file *m, void *data)
lower_num = *(u64 *)data;
#endif
if (upper_num == 0)
- seq_printf(m, "0x%llx", lower_num);
+ btf_show_type_value(show, "0x%llx", lower_num);
else
- seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+ btf_show_type_values(show, "0x%llx%016llx", upper_num,
+ lower_num);
}
static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
@@ -1491,8 +2307,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
#endif
}
-static void btf_bitfield_seq_show(void *data, u8 bits_offset,
- u8 nr_bits, struct seq_file *m)
+static void btf_bitfield_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct btf_show *show)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
@@ -1512,14 +2328,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset,
right_shift_bits = BITS_PER_U128 - nr_bits;
btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
- btf_int128_print(m, print_num);
+ btf_int128_print(show, print_num);
}
-static void btf_int_bits_seq_show(const struct btf *btf,
- const struct btf_type *t,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_bits_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 nr_bits = BTF_INT_BITS(int_data);
@@ -1532,55 +2348,77 @@ static void btf_int_bits_seq_show(const struct btf *btf,
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
- btf_bitfield_seq_show(data, bits_offset, nr_bits, m);
+ btf_bitfield_show(data, bits_offset, nr_bits, show);
}
-static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
u8 nr_bits = BTF_INT_BITS(int_data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
- return;
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ goto out;
}
switch (nr_bits) {
case 128:
- btf_int128_print(m, data);
+ btf_int128_print(show, safe_data);
break;
case 64:
if (sign)
- seq_printf(m, "%lld", *(s64 *)data);
+ btf_show_type_value(show, "%lld", *(s64 *)safe_data);
else
- seq_printf(m, "%llu", *(u64 *)data);
+ btf_show_type_value(show, "%llu", *(u64 *)safe_data);
break;
case 32:
if (sign)
- seq_printf(m, "%d", *(s32 *)data);
+ btf_show_type_value(show, "%d", *(s32 *)safe_data);
else
- seq_printf(m, "%u", *(u32 *)data);
+ btf_show_type_value(show, "%u", *(u32 *)safe_data);
break;
case 16:
if (sign)
- seq_printf(m, "%d", *(s16 *)data);
+ btf_show_type_value(show, "%d", *(s16 *)safe_data);
else
- seq_printf(m, "%u", *(u16 *)data);
+ btf_show_type_value(show, "%u", *(u16 *)safe_data);
break;
case 8:
+ if (show->state.array_encoding == BTF_INT_CHAR) {
+ /* check for null terminator */
+ if (show->state.array_terminated)
+ break;
+ if (*(char *)data == '\0') {
+ show->state.array_terminated = 1;
+ break;
+ }
+ if (isprint(*(char *)data)) {
+ btf_show_type_value(show, "'%c'",
+ *(char *)safe_data);
+ break;
+ }
+ }
if (sign)
- seq_printf(m, "%d", *(s8 *)data);
+ btf_show_type_value(show, "%d", *(s8 *)safe_data);
else
- seq_printf(m, "%u", *(u8 *)data);
+ btf_show_type_value(show, "%u", *(u8 *)safe_data);
break;
default:
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ break;
}
+out:
+ btf_show_end_type(show);
}
static const struct btf_kind_operations int_ops = {
@@ -1589,7 +2427,7 @@ static const struct btf_kind_operations int_ops = {
.check_member = btf_int_check_member,
.check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
- .seq_show = btf_int_seq_show,
+ .show = btf_int_show,
};
static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1672,6 +2510,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -1687,7 +2527,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -1696,6 +2536,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -1820,7 +2666,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -1853,34 +2699,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return 0;
}
-static void btf_modifier_seq_show(const struct btf *btf,
- const struct btf_type *t,
- u32 type_id, void *data,
- u8 bits_offset, struct seq_file *m)
+static void btf_modifier_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct btf_show *show)
{
if (btf->resolved_ids)
t = btf_type_id_resolve(btf, &type_id);
else
t = btf_type_skip_modifiers(btf, type_id, NULL);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_var_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
t = btf_type_id_resolve(btf, &type_id);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_ptr_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- /* It is a hashed value */
- seq_printf(m, "%p", *(void **)data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */
+ if (show->flags & BTF_SHOW_PTR_RAW)
+ btf_show_type_value(show, "0x%px", *(void **)safe_data);
+ else
+ btf_show_type_value(show, "0x%p", *(void **)safe_data);
+ btf_show_end_type(show);
}
static void btf_ref_type_log(struct btf_verifier_env *env,
@@ -1895,7 +2751,7 @@ static struct btf_kind_operations modifier_ops = {
.check_member = btf_modifier_check_member,
.check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_modifier_seq_show,
+ .show = btf_modifier_show,
};
static struct btf_kind_operations ptr_ops = {
@@ -1904,7 +2760,7 @@ static struct btf_kind_operations ptr_ops = {
.check_member = btf_ptr_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_ptr_seq_show,
+ .show = btf_ptr_show,
};
static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
@@ -1945,7 +2801,7 @@ static struct btf_kind_operations fwd_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_fwd_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static int btf_array_check_member(struct btf_verifier_env *env,
@@ -2104,28 +2960,90 @@ static void btf_array_log(struct btf_verifier_env *env,
array->type, array->index_type, array->nelems);
}
-static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void __btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_array *array = btf_type_array(t);
const struct btf_kind_operations *elem_ops;
const struct btf_type *elem_type;
- u32 i, elem_size, elem_type_id;
+ u32 i, elem_size = 0, elem_type_id;
+ u16 encoding = 0;
elem_type_id = array->type;
- elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL);
+ if (elem_type && btf_type_has_size(elem_type))
+ elem_size = elem_type->size;
+
+ if (elem_type && btf_type_is_int(elem_type)) {
+ u32 int_type = btf_type_int(elem_type);
+
+ encoding = BTF_INT_ENCODING(int_type);
+
+ /*
+ * BTF_INT_CHAR encoding never seems to be set for
+ * char arrays, so if size is 1 and element is
+ * printable as a char, we'll do that.
+ */
+ if (elem_size == 1)
+ encoding = BTF_INT_CHAR;
+ }
+
+ if (!btf_show_start_array_type(show, t, type_id, encoding, data))
+ return;
+
+ if (!elem_type)
+ goto out;
elem_ops = btf_type_ops(elem_type);
- seq_puts(m, "[");
+
for (i = 0; i < array->nelems; i++) {
- if (i)
- seq_puts(m, ",");
- elem_ops->seq_show(btf, elem_type, elem_type_id, data,
- bits_offset, m);
+ btf_show_start_array_member(show);
+
+ elem_ops->show(btf, elem_type, elem_type_id, data,
+ bits_offset, show);
data += elem_size;
+
+ btf_show_end_array_member(show);
+
+ if (show->state.array_terminated)
+ break;
}
- seq_puts(m, "]");
+out:
+ btf_show_end_array_type(show);
+}
+
+static void btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
+ show->state.member = m;
+
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero array member(s).
+ */
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations array_ops = {
@@ -2134,7 +3052,7 @@ static struct btf_kind_operations array_ops = {
.check_member = btf_array_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
- .seq_show = btf_array_seq_show,
+ .show = btf_array_show,
};
static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -2213,7 +3131,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -2257,7 +3175,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -2320,52 +3238,692 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+enum {
+ BTF_FIELD_IGNORE = 0,
+ BTF_FIELD_FOUND = 1,
+};
+
+struct btf_field_info {
+ enum btf_field_type type;
+ u32 off;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } graph_root;
+ };
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
- const struct btf_member *member;
- u32 i, off = -ENOENT;
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ info->type = field_type;
+ info->off = off;
+ return BTF_FIELD_FOUND;
+}
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ enum btf_field_type type;
+ u32 res_id;
+
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
+ /* For PTR, sz is always == 8 */
+ if (!btf_type_is_ptr(t))
+ return BTF_FIELD_IGNORE;
+ t = btf_type_by_id(btf, t->type);
+
+ if (!btf_type_is_type_tag(t))
+ return BTF_FIELD_IGNORE;
+ /* Reject extra tags */
+ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+ return -EINVAL;
+ if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_UNREF;
+ else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_REF;
+ else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_PERCPU;
+ else
+ return -EINVAL;
+
+ /* Get the base type */
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ /* Only pointer to struct is allowed */
if (!__btf_type_is_struct(t))
return -EINVAL;
+ info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
+ return BTF_FIELD_FOUND;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ const char *value = NULL;
+ int i;
+
+ for (i = 1; i < btf_nr_types(btf); i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ int len = strlen(tag_key);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type) ||
+ btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ /* Prevent duplicate entries for same type */
+ if (value)
+ return ERR_PTR(-EEXIST);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
+ }
+ if (!value)
+ return ERR_PTR(-ENOENT);
+ return value;
+}
+
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
+{
+ const char *node_field_name;
+ const char *value_type;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (IS_ERR(value_type))
+ return -EINVAL;
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
+ return -EINVAL;
+ value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ node_field_name++;
+ if (str_is_empty(node_field_name))
+ return -EINVAL;
+ info->type = head_type;
+ info->off = off;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
+ return BTF_FIELD_FOUND;
+}
+
+#define field_mask_test_name(field_type, field_type_str) \
+ if (field_mask & field_type && !strcmp(name, field_type_str)) { \
+ type = field_type; \
+ goto end; \
+ }
+
+static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+ int *align, int *sz)
+{
+ int type = 0;
+
+ if (field_mask & BPF_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_spin_lock")) {
+ if (*seen_mask & BPF_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_SPIN_LOCK;
+ type = BPF_SPIN_LOCK;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_TIMER) {
+ if (!strcmp(name, "bpf_timer")) {
+ if (*seen_mask & BPF_TIMER)
+ return -E2BIG;
+ *seen_mask |= BPF_TIMER;
+ type = BPF_TIMER;
+ goto end;
+ }
+ }
+ field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
+ field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
+ field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
+ field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
+
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & BPF_KPTR) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+#undef field_mask_test_name
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt)
+{
+ int ret, idx = 0, align, sz, field_type;
+ const struct btf_member *member;
+ struct btf_field_info tmp;
+ u32 i, off, seen_mask = 0;
+
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (!__btf_type_is_struct(member_type))
- continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
- continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
- if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
- return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ if (field_type < 0)
+ return field_type;
+
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
+ continue;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, member_type, off, sz, field_type,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ ret = btf_find_kptr(btf, member_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, t, member_type,
+ i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
+ }
+ return idx;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
+{
+ int ret, idx = 0, align, sz, field_type;
+ const struct btf_var_secinfo *vsi;
+ struct btf_field_info tmp;
+ u32 i, off, seen_mask = 0;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
+ continue;
+ if (field_type < 0)
+ return field_type;
+
+ off = vsi->offset;
+ if (vsi->size != sz)
+ continue;
+ if (off % align)
+ continue;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
+ }
+ return idx;
+}
+
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
+{
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return -EINVAL;
+}
+
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
+ int ret;
+ s32 id;
+
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ btf_get(kptr_btf);
+ goto found_dtor;
+ }
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+found_dtor:
+ field->kptr.btf_id = id;
+ field->kptr.btf = kptr_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kptr_btf);
+ return ret;
+}
+
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
+{
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
+
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % node_type_align)
+ return -EINVAL;
+
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
}
- return off;
+ if (!n)
+ return -ENOENT;
+ return 0;
+}
+
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
}
-static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
+{
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
+ struct btf_record *rec;
+ int ret, i, cnt;
+
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (!ret)
+ return NULL;
+
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+
+ rec->spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + field_type_size;
+
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ ret = -EFAULT;
+ goto end;
+ }
+ rec->cnt++;
+ }
+
+ /* bpf_{list_head, rb_node} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
+
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
+
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ u32 btf_id;
+
+ if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
+ continue;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].graph_root.value_rec = meta->record;
+
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & BPF_GRAPH_NODE))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_{list_head,rb_node} in a type. Instead, we use the
+ * following resaoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_{list,rb}_node. Let's call these node types.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_{list_head,rb_root}. Let's call these root types.
+ *
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A is an root type but not a node, its ownership
+ * chain can be:
+ * A -> B -> C
+ * Where:
+ * - A is an root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only an root, e.g. has bpf_list_node
+ *
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
+ * A -> B
+ * Where:
+ * - A is both an root and node.
+ * - B is only an node.
+ */
+ if (meta->record->field_mask & BPF_GRAPH_ROOT)
+ return -ELOOP;
+ }
+ return 0;
+}
+
+static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
const struct btf_member *member;
+ void *safe_data;
u32 i;
- seq_puts(m, "{");
+ safe_data = btf_show_start_struct_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
@@ -2374,23 +3932,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 bytes_offset;
u8 bits8_offset;
- if (i)
- seq_puts(m, seq);
+ btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
- btf_bitfield_seq_show(data + bytes_offset, bits8_offset,
- bitfield_size, m);
+ safe_data = btf_show_start_type(show, member_type,
+ member->type,
+ data + bytes_offset);
+ if (safe_data)
+ btf_bitfield_show(safe_data,
+ bits8_offset,
+ bitfield_size, show);
+ btf_show_end_type(show);
} else {
ops = btf_type_ops(member_type);
- ops->seq_show(btf, member_type, member->type,
- data + bytes_offset, bits8_offset, m);
+ ops->show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, show);
}
+
+ btf_show_end_member(show);
+ }
+
+ btf_show_end_struct_type(show);
+}
+
+static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
+ /* Restore saved member data here */
+ show->state.member = m;
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero child values.
+ */
}
- seq_puts(m, "}");
+
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations struct_ops = {
@@ -2399,7 +3999,7 @@ static struct btf_kind_operations struct_ops = {
.check_member = btf_struct_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
- .seq_show = btf_struct_seq_show,
+ .show = btf_struct_show,
};
static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -2468,6 +4068,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
+ const char *fmt_str;
u16 i, nr_enums;
u32 meta_needed;
@@ -2481,11 +4082,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
@@ -2516,7 +4112,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
if (env->log.level == BPF_LOG_KERNEL)
continue;
- btf_verifier_log(env, "\t%s val=%d\n",
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -2530,24 +4127,38 @@ static void btf_enum_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_enum *enums = btf_type_enum(t);
u32 i, nr_enums = btf_type_vlen(t);
- int v = *(int *)data;
+ void *safe_data;
+ int v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(int *)safe_data;
for (i = 0; i < nr_enums; i++) {
- if (v == enums[i].val) {
- seq_printf(m, "%s",
- __btf_name_by_offset(btf,
- enums[i].name_off));
- return;
- }
+ if (v != enums[i].val)
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
}
- seq_printf(m, "%d", v);
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
+ btf_show_end_type(show);
}
static struct btf_kind_operations enum_ops = {
@@ -2556,7 +4167,110 @@ static struct btf_kind_operations enum_ops = {
.check_member = btf_enum_check_member,
.check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
- .seq_show = btf_enum_seq_show,
+ .show = btf_enum_show,
+};
+
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
};
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
@@ -2635,7 +4349,7 @@ static struct btf_kind_operations func_proto_ops = {
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
- * It should be a funciton pointer instead.
+ * It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
@@ -2643,7 +4357,7 @@ static struct btf_kind_operations func_proto_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_func_proto_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_func_check_meta(struct btf_verifier_env *env,
@@ -2671,13 +4385,28 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_var_check_meta(struct btf_verifier_env *env,
@@ -2705,7 +4434,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off, true)) {
+ !__btf_name_valid(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
@@ -2741,7 +4470,7 @@ static const struct btf_kind_operations var_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_var_log,
- .seq_show = btf_var_seq_show,
+ .show = btf_var_show,
};
static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
@@ -2760,11 +4489,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (!btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen == 0");
- return -EINVAL;
- }
-
if (!t->size) {
btf_verifier_log_type(env, t, "size == 0");
return -EINVAL;
@@ -2829,6 +4553,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
u16 i;
+ env->resolve_mode = RESOLVE_TBD;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
u32 var_type_id = vsi->type, type_id, type_size = 0;
const struct btf_type *var_type = btf_type_by_id(env->btf,
@@ -2867,24 +4592,28 @@ static void btf_datasec_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_datasec_seq_show(const struct btf *btf,
- const struct btf_type *t, u32 type_id,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_datasec_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *var;
u32 i;
- seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off));
+ if (!btf_show_start_type(show, t, type_id, data))
+ return;
+
+ btf_show_type_value(show, "section (\"%s\") = {",
+ __btf_name_by_offset(btf, t->name_off));
for_each_vsi(i, t, vsi) {
var = btf_type_by_id(btf, vsi->type);
if (i)
- seq_puts(m, ",");
- btf_type_ops(var)->seq_show(btf, var, vsi->type,
- data + vsi->offset, bits_offset, m);
+ btf_show(show, ",");
+ btf_type_ops(var)->show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, show);
}
- seq_puts(m, "}");
+ btf_show_end_type(show);
}
static const struct btf_kind_operations datasec_ops = {
@@ -2893,7 +4622,186 @@ static const struct btf_kind_operations datasec_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_datasec_log,
- .seq_show = btf_datasec_seq_show,
+ .show = btf_datasec_show,
+};
+
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+ t->size != 16) {
+ btf_verifier_log_type(env, t, "Invalid type_size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u64 start_offset_bytes;
+ u64 end_offset_bytes;
+ u64 misalign_bits;
+ u64 align_bytes;
+ u64 align_bits;
+
+ /* Different architectures have different alignment requirements, so
+ * here we check only for the reasonable minimum. This way we ensure
+ * that types after CO-RE can pass the kernel BTF verifier.
+ */
+ align_bytes = min_t(u64, sizeof(void *), member_type->size);
+ align_bits = align_bytes * BITS_PER_BYTE;
+ div64_u64_rem(member->offset, align_bits, &misalign_bits);
+ if (misalign_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not properly aligned");
+ return -EINVAL;
+ }
+
+ start_offset_bytes = member->offset / BITS_PER_BYTE;
+ end_offset_bytes = start_offset_bytes + member_type->size;
+ if (end_offset_bytes > struct_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void btf_float_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+ .check_meta = btf_float_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_float_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_float_log,
+ .show = btf_df_show,
+};
+
+static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_decl_tag *tag;
+ u32 meta_needed = sizeof(*tag);
+ s32 component_idx;
+ const char *value;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid value");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx < -1) {
+ btf_verifier_log_type(env, t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_decl_tag_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ s32 component_idx;
+ u32 vlen;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx != -1) {
+ if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_struct(next_type)) {
+ vlen = btf_type_vlen(next_type);
+ } else {
+ /* next_type should be a function */
+ next_type = btf_type_by_id(btf, next_type->type);
+ vlen = btf_type_vlen(next_type);
+ }
+
+ if ((u32)component_idx >= vlen) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ btf_verifier_log(env, "type=%u component_idx=%d", t->type,
+ btf_type_decl_tag(t)->component_idx);
+}
+
+static const struct btf_kind_operations decl_tag_ops = {
+ .check_meta = btf_decl_tag_check_meta,
+ .resolve = btf_decl_tag_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_decl_tag_log,
+ .show = btf_df_show,
};
static int btf_func_proto_check(struct btf_verifier_env *env,
@@ -2919,6 +4827,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_is_resolve_source_only(ret_type)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
if (btf_type_needs_resolve(ret_type) &&
!env_type_is_resolved(env, ret_type_id)) {
err = btf_resolve(env, ret_type, ret_type_id);
@@ -2946,7 +4859,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -2955,8 +4867,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -2964,25 +4880,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -3029,6 +4943,10 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
+ [BTF_KIND_FLOAT] = &float_ops,
+ [BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -3083,7 +5001,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
cur = btf->nohdr_data + hdr->type_off;
end = cur + hdr->type_len;
- env->log_type_id = 1;
+ env->log_type_id = btf->base_btf ? btf->start_id : 1;
while (cur < end) {
struct btf_type *t = cur;
s32 meta_size;
@@ -3110,8 +5028,12 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return false;
if (btf_type_is_struct(t) || btf_type_is_datasec(t))
- return !btf->resolved_ids[type_id] &&
- !btf->resolved_sizes[type_id];
+ return !btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
+ return btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
@@ -3131,7 +5053,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
return elem_type && !btf_type_is_modifier(elem_type) &&
(array->nelems * elem_size ==
- btf->resolved_sizes[type_id]);
+ btf_resolved_type_size(btf, type_id));
}
return false;
@@ -3173,7 +5095,8 @@ static int btf_resolve(struct btf_verifier_env *env,
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
- u32 type_id;
+ const struct btf_type *t;
+ u32 type_id, i;
int err;
err = env_resolve_init(env);
@@ -3181,8 +5104,9 @@ static int btf_check_all_types(struct btf_verifier_env *env)
return err;
env->phase++;
- for (type_id = 1; type_id <= btf->nr_types; type_id++) {
- const struct btf_type *t = btf_type_by_id(btf, type_id);
+ for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) {
+ type_id = btf->start_id + i;
+ t = btf_type_by_id(btf, type_id);
env->log_type_id = type_id;
if (btf_type_needs_resolve(t) &&
@@ -3197,12 +5121,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -3219,7 +5137,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env)
return -EINVAL;
}
- if (!hdr->type_len) {
+ if (!env->btf->base_btf && !hdr->type_len) {
btf_verifier_log(env, "No type found");
return -EINVAL;
}
@@ -3246,13 +5164,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
return -EINVAL;
}
- if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
- start[0] || end[-1]) {
+ btf->strings = start;
+
+ if (btf->base_btf && !hdr->str_len)
+ return 0;
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+ if (!btf->base_btf && start[0]) {
btf_verifier_log(env, "Invalid string section");
return -EINVAL;
}
-
- btf->strings = start;
return 0;
}
@@ -3328,13 +5251,11 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -3381,50 +5302,206 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -ENOTSUPP;
}
- if (btf_data_size == hdr->hdr_len) {
+ if (!btf->base_btf && btf_data_size == hdr->hdr_len) {
btf_verifier_log(env, "No data");
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
+ return btf_check_sec_info(env, btf_data_size);
+}
+
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
+ "bpf_refcount",
+};
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ union {
+ struct btf_id_set set;
+ struct {
+ u32 _cnt;
+ u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
+ } _arr;
+ } aof;
+ struct btf_struct_metas *tab = NULL;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ memset(&aof, 0, sizeof(aof));
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ aof.set.ids[aof.set.cnt++] = id;
+ }
+
+ if (!aof.set.cnt)
+ return NULL;
+ sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free;
+ }
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(&aof.set, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ tab->cnt++;
+ }
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
+static int btf_check_type_tags(struct btf_verifier_env *env,
+ struct btf *btf, int start_id)
+{
+ int i, n, good_id = start_id - 1;
+ bool in_tags;
+
+ n = btf_nr_types(btf);
+ for (i = start_id; i < n; i++) {
+ const struct btf_type *t;
+ int chain_limit = 32;
+ u32 cur_id = i;
+
+ t = btf_type_by_id(btf, i);
+ if (!t)
+ return -EINVAL;
+ if (!btf_type_is_modifier(t))
+ continue;
+
+ cond_resched();
+
+ in_tags = btf_type_is_type_tag(t);
+ while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
+ if (btf_type_is_type_tag(t)) {
+ if (!in_tags) {
+ btf_verifier_log(env, "Type tags don't precede modifiers");
+ return -EINVAL;
+ }
+ } else if (in_tags) {
+ in_tags = false;
+ }
+ if (cur_id <= good_id)
+ break;
+ /* Move to next type */
+ cur_id = t->type;
+ t = btf_type_by_id(btf, cur_id);
+ if (!t)
+ return -EINVAL;
+ }
+ good_id = i;
+ }
return 0;
}
-static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -3433,16 +5510,16 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_user(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -3461,16 +5538,43 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
if (err)
goto errout;
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
goto errout;
}
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
+ }
+
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
@@ -3511,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
- const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
- const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
- if (!conv_struct) {
- bpf_log(log, "btf_vmlinux is malformed\n");
+ if (!conv_struct)
return NULL;
- }
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return -EFAULT;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct sk_buff'
+ */
+ return ctx_type->type;
+}
+
+const struct btf_type *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -3535,8 +5664,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
@@ -3544,16 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
- /* prog_type is valid bpf program type. No need for bounds check. */
- ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
- /* ctx_struct is a pointer to prog_ctx_type in vmlinux.
- * Like 'struct __sk_buff'
- */
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
- if (!ctx_struct)
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
return NULL;
- ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+ }
+again:
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -3566,32 +5692,206 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, tname))
- return NULL;
+ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_type))
+ return NULL;
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+ goto again;
+ }
return ctx_type;
}
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
+
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int arg,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
+ if (!btf_is_ptr(t)) {
+ bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return 0;
+ }
+ }
+
+ /* all other program types don't use typedefs for context type */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* `void *ctx __arg_ctx` is always valid */
+ if (btf_type_is_void(t))
+ return 0;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(tname)) {
+ bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+ return -EINVAL;
+ }
+
+ /* special cases */
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (attach_type) {
+ case BPF_TRACE_RAW_TP:
+ /* tp_btf program is TRACING, so need special case here */
+ if (__btf_type_is_struct(t) &&
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+ return 0;
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_TRACE_ITER:
+ /* allow struct bpf_iter__xxx types only */
+ if (__btf_type_is_struct(t) &&
+ strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+ return 0;
+ break;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ return 0; /* anything goes */
+ default:
+ break;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ /* should not happen */
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ /* resolve typedefs and check that underlying structs are matching as well */
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+ /* if program type doesn't have distinctly named struct type for
+ * context, then __arg_ctx argument can only be `void *`, which we
+ * already checked above
+ */
+ if (!__btf_type_is_struct(ctx_type)) {
+ bpf_log(log, "arg#%d should be void pointer\n", arg);
+ return -EINVAL;
+ }
+
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+ bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
enum bpf_prog_type prog_type,
int arg)
{
- const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
- if (!prog_ctx_type)
+ if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
- kern_ctx_type = prog_ctx_type + 1;
- return kern_ctx_type->type;
+ return find_kern_ctx_type_id(prog_type);
}
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
+BTF_ID_LIST(bpf_ctx_convert_btf_id)
+BTF_ID(struct, bpf_ctx_convert)
+
struct btf *btf_parse_vmlinux(void)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
- int err, i;
+ int err;
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
@@ -3609,6 +5909,8 @@ struct btf *btf_parse_vmlinux(void)
btf->data = __start_BTF;
btf->data_size = __stop_BTF - __start_BTF;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "vmlinux");
err = btf_parse_hdr(env);
if (err)
@@ -3624,27 +5926,93 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* find struct bpf_ctx_convert for type checking later */
- for (i = 1; i <= btf->nr_types; i++) {
- const struct btf_type *t;
- const char *tname;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
- t = btf_type_by_id(btf, i);
- if (!__btf_type_is_struct(t))
- continue;
- tname = __btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, "bpf_ctx_convert")) {
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = t;
- break;
- }
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+
+ bpf_struct_ops_init(btf, log);
+
+ refcount_set(&btf->refcnt, 1);
+
+ err = btf_alloc_id(btf);
+ if (err)
+ goto errout;
+
+ btf_verifier_env_free(env);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf) {
+ kvfree(btf->types);
+ kfree(btf);
}
- if (i > btf->nr_types) {
- err = -ENOENT;
+ return ERR_PTR(err);
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL, *base_btf;
+ int err;
+
+ base_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(base_btf))
+ return base_btf;
+ if (!base_btf)
+ return ERR_PTR(-EINVAL);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
goto errout;
}
+ env->btf = btf;
- bpf_struct_ops_init(btf, log);
+ btf->base_btf = base_btf;
+ btf->start_id = base_btf->nr_types;
+ btf->start_str_off = base_btf->hdr.str_len;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "%s", module_name);
+
+ btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!btf->data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ memcpy(btf->data, data, data_size);
+ btf->data_size = data_size;
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+ if (err)
+ goto errout;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
@@ -3653,34 +6021,75 @@ struct btf *btf_parse_vmlinux(void)
errout:
btf_verifier_env_free(env);
if (btf) {
+ kvfree(btf->data);
kvfree(btf->types);
kfree(btf);
}
return ERR_PTR(err);
}
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
{
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
- if (tgt_prog) {
+ if (tgt_prog)
return tgt_prog->aux->btf;
- } else {
- return btf_vmlinux;
- }
+ else
+ return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+ return btf_type_is_int(t);
+}
+
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -3688,11 +6097,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const struct btf_type *t = prog->aux->attach_func_proto;
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
struct btf *btf = bpf_prog_get_target_btf(prog);
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -3701,10 +6111,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = get_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
- /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
- nr_args = t ? btf_type_vlen(t) : 5;
+ /* if (t == NULL) Fall back to default BPF prog with
+ * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+ */
+ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
if (prog->aux->attach_btf_trace) {
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
@@ -3719,6 +6131,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
+ case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
@@ -3749,7 +6162,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -3760,7 +6173,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
} else {
if (!t)
- /* Default prog with 5 args */
+ /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
return true;
t = btf_type_by_id(btf, args[arg].type);
}
@@ -3768,7 +6181,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -3776,9 +6189,24 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
+
+ /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
+
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
+ info->reg_type = ctx_arg_info->reg_type;
+ return true;
+ }
+ }
+
if (t->type == 0)
/* This is a pointer to void.
* It is the same as scalar from the verifier safety pov.
@@ -3786,23 +6214,41 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
- info->reg_type = PTR_TO_BTF_ID;
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
info->reg_type = ctx_arg_info->reg_type;
- break;
+ info->btf = btf_vmlinux;
+ info->btf_id = ctx_arg_info->btf_id;
+ return true;
}
}
+ info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
+ enum bpf_prog_type tgt_type;
+
+ if (tgt_prog->type == BPF_PROG_TYPE_EXT)
+ tgt_type = tgt_prog->aux->saved_dst_prog_type;
+ else
+ tgt_type = tgt_prog->type;
+
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
if (ret > 0) {
+ info->btf = btf_vmlinux;
info->btf_id = ret;
return true;
} else {
@@ -3810,8 +6256,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
+ info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -3820,34 +6276,51 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
return true;
}
-int btf_struct_access(struct bpf_verifier_log *log,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype,
- u32 *next_btf_id)
+enum bpf_struct_walk_result {
+ /* < 0 error */
+ WALK_SCALAR = 0,
+ WALK_PTR,
+ WALK_STRUCT,
+};
+
+static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
- u32 vlen;
+ const char *tname, *mname, *tag_value;
+ u32 vlen, elem_id, mid;
again:
- tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+ if (btf_type_is_modifier(t))
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
vlen = btf_type_vlen(t);
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED))
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
@@ -3858,7 +6331,7 @@ again:
goto error;
member = btf_type_member(t) + vlen - 1;
- mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
+ mtype = btf_type_skip_modifiers(btf, member->type,
NULL);
if (!btf_type_is_array(mtype))
goto error;
@@ -3867,21 +6340,22 @@ again:
if (array_elem->nelems != 0)
goto error;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
- /* Only allow structure for now, can be relaxed for
- * other types later.
- */
- elem_type = btf_type_skip_modifiers(btf_vmlinux,
- array_elem->type, NULL);
- if (!btf_type_is_struct(elem_type))
+ /* allow structure and integer */
+ t = btf_type_skip_modifiers(btf, array_elem->type,
+ NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
+ if (!btf_type_is_struct(t))
goto error;
- off = (off - moff) % elem_type->size;
- return btf_struct_access(log, elem_type, off, size, atype,
- next_btf_id);
+ off = (off - moff) % t->size;
+ goto again;
error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
@@ -3891,14 +6365,14 @@ error:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -3910,7 +6384,7 @@ error:
*/
if (off <= moff &&
BITS_ROUNDUP_BYTES(end_bit) <= off + size)
- return SCALAR_VALUE;
+ return WALK_SCALAR;
/* off may be accessing a following member
*
@@ -3932,11 +6406,13 @@ error:
break;
/* type of the field */
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
+ mid = member->type;
+ mtype = btf_type_by_id(btf, member->type);
+ mname = __btf_name_by_offset(btf, member->name_off);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- &elem_type, &total_nelems);
+ mtype = __btf_resolve_size(btf, mtype, &msize,
+ &elem_type, &elem_id, &total_nelems,
+ &mid);
if (IS_ERR(mtype)) {
bpf_log(log, "field %s doesn't have size\n", mname);
return -EFAULT;
@@ -3950,7 +6426,7 @@ error:
if (btf_type_is_array(mtype)) {
u32 elem_idx;
- /* btf_resolve_size() above helps to
+ /* __btf_resolve_size() above helps to
* linearize a multi-dimensional array.
*
* The logic here is treating an array
@@ -3998,6 +6474,7 @@ error:
elem_idx = (off - moff) / msize;
moff += elem_idx * msize;
mtype = elem_type;
+ mid = elem_id;
}
/* the 'off' we're looking for is either equal to start
@@ -4007,13 +6484,20 @@ error:
/* our field must be inside that union or struct */
t = mtype;
+ /* return if the offset matches the member offset */
+ if (off == moff) {
+ *next_btf_id = mid;
+ return WALK_STRUCT;
+ }
+
/* adjust offset we're looking for */
off -= moff;
goto again;
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -4023,10 +6507,28 @@ error:
return -EACCES;
}
- stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
+ }
+
+ stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
- return PTR_TO_BTF_ID;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
+ return WALK_PTR;
}
}
@@ -4036,140 +6538,182 @@ error:
* that also allows using an array of int as a scratch
* space. e.g. skb->cb[].
*/
- if (off + size > mtrue_end) {
+ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) {
bpf_log(log,
"access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
mname, mtrue_end, tname, off, size);
return -EACCES;
}
- return SCALAR_VALUE;
+ return WALK_SCALAR;
}
bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
return -EINVAL;
}
-static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
- int arg)
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
- char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
- const struct btf_param *args;
+ const struct btf *btf = reg->btf;
+ enum bpf_type_flag tmp_flag = 0;
const struct btf_type *t;
- const char *tname, *sym;
- u32 btf_id, i;
+ u32 id = reg->btf_id;
+ int err;
- if (!btf_vmlinux) {
- bpf_log(log, "btf_vmlinux doesn't exist\n");
- return -EINVAL;
- }
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
- if (IS_ERR(btf_vmlinux)) {
- bpf_log(log, "btf_vmlinux is malformed\n");
- return -EINVAL;
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
}
- sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
- if (!sym) {
- bpf_log(log, "kernel doesn't have kallsyms\n");
- return -EFAULT;
- }
+ t = btf_type_by_id(btf, id);
+ do {
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
- for (i = 1; i <= btf_vmlinux->nr_types; i++) {
- t = btf_type_by_id(btf_vmlinux, i);
- if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
- continue;
- tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!strcmp(tname, fnname))
+ switch (err) {
+ case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
+ /* If we found the pointer or scalar on t+off,
+ * we're done.
+ */
+ *next_btf_id = id;
+ *flag = tmp_flag;
+ return PTR_TO_BTF_ID;
+ case WALK_SCALAR:
+ return SCALAR_VALUE;
+ case WALK_STRUCT:
+ /* We found nested struct, so continue the search
+ * by diving in it. At this point the offset is
+ * aligned with the new type, so set it to 0.
+ */
+ t = btf_type_by_id(btf, id);
+ off = 0;
break;
- }
- if (i > btf_vmlinux->nr_types) {
- bpf_log(log, "helper %s type is not found\n", fnname);
- return -ENOENT;
- }
-
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_ptr(t))
- return -EFAULT;
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_func_proto(t))
- return -EFAULT;
+ default:
+ /* It's either error or unknown return value..
+ * scream and leave.
+ */
+ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value"))
+ return -EINVAL;
+ return err;
+ }
+ } while (t);
- args = (const struct btf_param *)(t + 1);
- if (arg >= btf_type_vlen(t)) {
- bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
- fnname, arg);
- return -EINVAL;
- }
+ return -EINVAL;
+}
- t = btf_type_by_id(btf_vmlinux, args[arg].type);
- if (!btf_type_is_ptr(t) || !t->type) {
- /* anything but the pointer to struct is a helper config bug */
- bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
- return -EFAULT;
- }
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- /* skip modifiers */
- while (btf_type_is_modifier(t)) {
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- }
- if (!btf_type_is_struct(t)) {
- bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
- return -EFAULT;
- }
- bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
- arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
- return btf_id;
+/* Check that two BTF types, each specified as an BTF object + id, are exactly
+ * the same. Trivial ID check is not enough due to module BTFs, because we can
+ * end up with two different module BTFs, but IDs point to the common type in
+ * vmlinux BTF.
+ */
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
+{
+ if (id1 != id2)
+ return false;
+ if (btf1 == btf2)
+ return true;
+ return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
}
-int btf_resolve_helper_id(struct bpf_verifier_log *log,
- const struct bpf_func_proto *fn, int arg)
+bool btf_struct_ids_match(struct bpf_verifier_log *log,
+ const struct btf *btf, u32 id, int off,
+ const struct btf *need_btf, u32 need_type_id,
+ bool strict)
{
- int *btf_id = &fn->btf_id[arg];
- int ret;
+ const struct btf_type *type;
+ enum bpf_type_flag flag = 0;
+ int err;
- if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
- return -EINVAL;
+ /* Are we already done? */
+ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
+ return true;
+ /* In case of strict type match, we do not walk struct, the top level
+ * type match must succeed. When strict is true, off should have already
+ * been 0.
+ */
+ if (strict)
+ return false;
+again:
+ type = btf_type_by_id(btf, id);
+ if (!type)
+ return false;
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
+ if (err != WALK_STRUCT)
+ return false;
- ret = READ_ONCE(*btf_id);
- if (ret)
- return ret;
- /* ok to race the search. The result is the same */
- ret = __btf_resolve_helper_id(log, fn->func, arg);
- if (!ret) {
- /* Function argument cannot be type 'void' */
- bpf_log(log, "BTF resolution bug\n");
- return -EFAULT;
+ /* We found nested struct object. If it matches
+ * the requested ID, we're done. Otherwise let's
+ * continue the search with offset 0 in the new
+ * type.
+ */
+ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
+ off = 0;
+ goto again;
}
- WRITE_ONCE(*btf_id, ret);
- return ret;
+
+ return true;
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf->types[0];
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (__btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
@@ -4183,40 +6727,59 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
if (!func) {
/* BTF function prototype doesn't match the verifier types.
- * Fall back to 5 u64 args.
+ * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < 5; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
- m->nr_args = 5;
+ m->ret_flags = 0;
+ m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || __btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
for (i = 0; i < nargs; i++) {
+ if (i == nargs - 1 && args[i].type == 0) {
+ bpf_log(log,
+ "The function %s with variable args is unsupported.\n",
+ tname);
+ return -EINVAL;
+ }
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
+ return -EINVAL;
+ }
+ if (ret == 0) {
+ bpf_log(log,
+ "The function %s has malformed void argument.\n",
+ tname);
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
}
m->nr_args = nargs;
return 0;
@@ -4300,7 +6863,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
- if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
@@ -4341,7 +6904,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
}
/* Compare BTFs of given program with BTF of target program */
-int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf2, const struct btf_type *t2)
{
struct btf *btf1 = prog->aux->btf;
@@ -4349,7 +6912,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
u32 btf_id = 0;
if (!prog->aux->func_info) {
- bpf_log(&env->log, "Program extension requires BTF\n");
+ bpf_log(log, "Program extension requires BTF\n");
return -EINVAL;
}
@@ -4361,139 +6924,67 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
if (!t1 || !btf_type_is_func(t1))
return -EFAULT;
- return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+ return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
{
- struct bpf_verifier_log *log = &env->log;
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- const struct btf_param *args;
- const struct btf_type *t;
- u32 i, nargs, btf_id;
- const char *tname;
-
- if (!prog->aux->func_info)
- return -EINVAL;
+ const char *name;
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
+ t = btf_type_by_id(btf, t->type); /* skip PTR */
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
- t = btf_type_by_id(btf, btf_id);
- if (!t || !btf_type_is_func(t)) {
- /* These checks were already done by the verifier while loading
- * struct bpf_func_info
- */
- bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
- subprog);
- return -EFAULT;
+ /* allow either struct or struct forward declaration */
+ if (btf_type_is_struct(t) ||
+ (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) {
+ name = btf_str_by_offset(btf, t->name_off);
+ return name && strcmp(name, "bpf_dynptr") == 0;
}
- tname = btf_name_by_offset(btf, t->name_off);
- t = btf_type_by_id(btf, t->type);
- if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", tname);
- return -EFAULT;
- }
- args = (const struct btf_param *)(t + 1);
- nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
- goto out;
- }
- /* check that BTF function arguments match actual types that the
- * verifier sees.
- */
- for (i = 0; i < nargs; i++) {
- t = btf_type_by_id(btf, args[i].type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- if (reg[i + 1].type == SCALAR_VALUE)
- continue;
- bpf_log(log, "R%d is not a scalar\n", i + 1);
- goto out;
- }
- if (btf_type_is_ptr(t)) {
- if (reg[i + 1].type == SCALAR_VALUE) {
- bpf_log(log, "R%d is not a pointer\n", i + 1);
- goto out;
- }
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg[i + 1].type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
- }
- if (check_ctx_reg(env, &reg[i + 1], i + 1))
- goto out;
- continue;
- }
- }
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
- }
- return 0;
-out:
- /* Compiler optimizations can remove arguments from static functions
- * or mismatched type can be passed into a global function.
- * In such cases mark the function as unreliable from BTF point of view.
- */
- prog->aux->func_info_aux[subprog].unreliable = true;
- return -EINVAL;
+ return false;
}
-/* Convert BTF of a function into bpf_reg_state if possible
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - cannot convert BTF.
- * 0 - Successfully converted BTF into bpf_reg_state
- * (either PTR_TO_CTX or SCALAR_VALUE).
+ * 0 - Successfully processed BTF and constructed argument expectations.
*/
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
{
+ bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t;
+ const struct btf_type *t, *ref_t, *fn_t;
u32 i, nargs, btf_id;
const char *tname;
- if (!prog->aux->func_info ||
- prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ if (sub->args_cached)
+ return 0;
+
+ if (!prog->aux->func_info) {
bpf_log(log, "Verifier bug\n");
return -EFAULT;
}
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id) {
+ if (!is_global) /* not fatal for static funcs */
+ return -EINVAL;
bpf_log(log, "Global functions need valid BTF\n");
return -EFAULT;
}
- t = btf_type_by_id(btf, btf_id);
- if (!t || !btf_type_is_func(t)) {
+ fn_t = btf_type_by_id(btf, btf_id);
+ if (!fn_t || !btf_type_is_func(fn_t)) {
/* These checks were already done by the verifier while loading
* struct bpf_func_info
*/
@@ -4501,36 +6992,32 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
subprog);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
-
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "Validating %s() func#%d...\n",
- tname, subprog);
+ tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
bpf_log(log, "Verifier bug in function %s()\n", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
- prog_type = prog->aux->linked_prog->type;
+ prog_type = prog->aux->dst_prog->type;
- t = btf_type_by_id(btf, t->type);
+ t = btf_type_by_id(btf, fn_t->type);
if (!t || !btf_type_is_func_proto(t)) {
bpf_log(log, "Invalid type of function %s()\n", tname);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
- tname, nargs);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int */
+ /* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -4540,31 +7027,185 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
+ bool is_nonnull = false;
+ const char *tag;
+
t = btf_type_by_id(btf, args[i].type);
+
+ tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+ if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
+ tag = NULL;
+ } else if (IS_ERR(tag)) {
+ bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
+ return PTR_ERR(tag);
+ }
+ /* 'arg:<tag>' decl_tag takes precedence over derivation of
+ * register type from BTF type itself
+ */
+ if (tag) {
+ /* disallow arg tags in static subprogs */
+ if (!is_global) {
+ bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
+ return -EOPNOTSUPP;
+ }
+ if (strcmp(tag, "ctx") == 0) {
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
+ continue;
+ }
+ if (strcmp(tag, "nonnull") == 0)
+ is_nonnull = true;
+ }
+
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- reg[i + 1].type = SCALAR_VALUE;
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
+ sub->args[i].arg_type = ARG_ANYTHING;
+ continue;
+ }
+ if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
continue;
}
- if (btf_type_is_ptr(t) &&
- btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- reg[i + 1].type = PTR_TO_CTX;
+ if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) {
+ sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+ continue;
+ }
+ if (is_global && btf_type_is_ptr(t)) {
+ u32 mem_size;
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ ref_t = btf_resolve_size(btf, t, &mem_size);
+ if (IS_ERR(ref_t)) {
+ bpf_log(log,
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ PTR_ERR(ref_t));
+ return -EINVAL;
+ }
+
+ sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL;
+ sub->args[i].mem_size = mem_size;
continue;
}
+ if (is_nonnull) {
+ bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i);
+ return -EINVAL;
+ }
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
+
+ for (i = 0; i < nargs; i++) {
+ const char *tag;
+
+ if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
+ continue;
+
+ /* check if arg has "arg:ctx" tag */
+ t = btf_type_by_id(btf, args[i].type);
+ tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+ if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
+ continue;
+
+ if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
+ }
+
+ sub->arg_cnt = nargs;
+ sub->args_cached = true;
+
return 0;
}
+static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
+ struct btf_show *show)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ show->btf = btf;
+ memset(&show->state, 0, sizeof(show->state));
+ memset(&show->obj, 0, sizeof(show->obj));
+
+ btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
+}
+
+static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ seq_vprintf((struct seq_file *)show->target, fmt, args);
+}
+
+int btf_type_seq_show_flags(const struct btf *btf, u32 type_id,
+ void *obj, struct seq_file *m, u64 flags)
+{
+ struct btf_show sseq;
+
+ sseq.target = m;
+ sseq.showfn = btf_seq_show;
+ sseq.flags = flags;
+
+ btf_type_show(btf, type_id, obj, &sseq);
+
+ return sseq.state.status;
+}
+
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
struct seq_file *m)
{
- const struct btf_type *t = btf_type_by_id(btf, type_id);
+ (void) btf_type_seq_show_flags(btf, type_id, obj, m,
+ BTF_SHOW_NONAME | BTF_SHOW_COMPACT |
+ BTF_SHOW_ZERO | BTF_SHOW_UNSAFE);
+}
+
+struct btf_show_snprintf {
+ struct btf_show show;
+ int len_left; /* space left in string */
+ int len; /* length we would have written */
+};
- btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
+ int len;
+
+ len = vsnprintf(show->target, ssnprintf->len_left, fmt, args);
+
+ if (len < 0) {
+ ssnprintf->len_left = 0;
+ ssnprintf->len = len;
+ } else if (len >= ssnprintf->len_left) {
+ /* no space, drive on to get length we would have written */
+ ssnprintf->len_left = 0;
+ ssnprintf->len += len;
+ } else {
+ ssnprintf->len_left -= len;
+ ssnprintf->len += len;
+ show->target += len;
+ }
+}
+
+int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
+ char *buf, int len, u64 flags)
+{
+ struct btf_show_snprintf ssnprintf;
+
+ ssnprintf.show.target = buf;
+ ssnprintf.show.flags = flags;
+ ssnprintf.show.showfn = btf_snprintf_show;
+ ssnprintf.len_left = len;
+ ssnprintf.len = 0;
+
+ btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
+
+ /* If we encountered an error, return it. */
+ if (ssnprintf.show.state.status)
+ return ssnprintf.show.state.status;
+
+ /* Otherwise return length we would have written */
+ return ssnprintf.len;
}
#ifdef CONFIG_PROC_FS
@@ -4594,15 +7235,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(u64_to_user_ptr(attr->btf),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -4655,7 +7293,9 @@ int btf_get_info_by_fd(const struct btf *btf,
struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
- u32 uinfo_len;
+ char __user *uname;
+ u32 uinfo_len, uname_len, name_len;
+ int ret = 0;
uinfo = u64_to_user_ptr(attr->info.info);
uinfo_len = attr->info.info_len;
@@ -4672,11 +7312,37 @@ int btf_get_info_by_fd(const struct btf *btf,
return -EFAULT;
info.btf_size = btf->data_size;
+ info.kernel_btf = btf->kernel_btf;
+
+ uname = u64_to_user_ptr(info.name);
+ uname_len = info.name_len;
+ if (!uname ^ !uname_len)
+ return -EINVAL;
+
+ name_len = strlen(btf->name);
+ info.name_len = name_len;
+
+ if (uname) {
+ if (uname_len >= name_len + 1) {
+ if (copy_to_user(uname, btf->name, name_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(uname, btf->name, uname_len - 1))
+ return -EFAULT;
+ if (put_user(zero, uname + uname_len - 1))
+ return -EFAULT;
+ /* let user-space know about too short buffer */
+ ret = -ENOSPC;
+ }
+ }
+
if (copy_to_user(uinfo, &info, info_copy) ||
put_user(info_copy, &uattr->info.info_len))
return -EFAULT;
- return 0;
+ return ret;
}
int btf_get_fd_by_id(u32 id)
@@ -4700,7 +7366,1282 @@ int btf_get_fd_by_id(u32 id)
return fd;
}
-u32 btf_id(const struct btf *btf)
+u32 btf_obj_id(const struct btf *btf)
{
return btf->id;
}
+
+bool btf_is_kernel(const struct btf *btf)
+{
+ return btf->kernel_btf;
+}
+
+bool btf_is_module(const struct btf *btf)
+{
+ return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
+}
+
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+struct btf_module {
+ struct list_head list;
+ struct module *module;
+ struct btf *btf;
+ struct bin_attribute *sysfs_attr;
+ int flags;
+};
+
+static LIST_HEAD(btf_modules);
+static DEFINE_MUTEX(btf_module_mutex);
+
+static ssize_t
+btf_module_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ const struct btf *btf = bin_attr->private;
+
+ memcpy(buf, btf->data + off, len);
+ return len;
+}
+
+static void purge_cand_cache(struct btf *btf);
+
+static int btf_module_notify(struct notifier_block *nb, unsigned long op,
+ void *module)
+{
+ struct btf_module *btf_mod, *tmp;
+ struct module *mod = module;
+ struct btf *btf;
+ int err = 0;
+
+ if (mod->btf_data_size == 0 ||
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
+ goto out;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL);
+ if (!btf_mod) {
+ err = -ENOMEM;
+ goto out;
+ }
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ if (IS_ERR(btf)) {
+ kfree(btf_mod);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
+ err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
+ goto out;
+ }
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ kfree(btf_mod);
+ goto out;
+ }
+
+ purge_cand_cache(NULL);
+ mutex_lock(&btf_module_mutex);
+ btf_mod->module = module;
+ btf_mod->btf = btf;
+ list_add(&btf_mod->list, &btf_modules);
+ mutex_unlock(&btf_module_mutex);
+
+ if (IS_ENABLED(CONFIG_SYSFS)) {
+ struct bin_attribute *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ goto out;
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = btf->name;
+ attr->attr.mode = 0444;
+ attr->size = btf->data_size;
+ attr->private = btf;
+ attr->read = btf_module_read;
+
+ err = sysfs_create_bin_file(btf_kobj, attr);
+ if (err) {
+ pr_warn("failed to register module [%s] BTF in sysfs: %d\n",
+ mod->name, err);
+ kfree(attr);
+ err = 0;
+ goto out;
+ }
+
+ btf_mod->sysfs_attr = attr;
+ }
+
+ break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
+ case MODULE_STATE_GOING:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ list_del(&btf_mod->list);
+ if (btf_mod->sysfs_attr)
+ sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
+ btf_put(btf_mod->btf);
+ kfree(btf_mod->sysfs_attr);
+ kfree(btf_mod);
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
+ }
+out:
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block btf_module_nb = {
+ .notifier_call = btf_module_notify,
+};
+
+static int __init btf_module_init(void)
+{
+ register_module_notifier(&btf_module_nb);
+ return 0;
+}
+
+fs_initcall(btf_module_init);
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct module *btf_try_get_module(const struct btf *btf)
+{
+ struct module *res = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->btf != btf)
+ continue;
+
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
+ res = btf_mod->module;
+
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return res;
+}
+
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
+ long ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
+ }
+ return ret | (((u64)btf_obj_fd) << 32);
+ }
+ if (ret > 0)
+ btf_put(btf);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+ .func = bpf_btf_find_by_name_kind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *name, *sfx, *iter_name;
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ arg = &btf_params(func)[0];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = name + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Kernel Function (kfunc) BTF ID set registration API */
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *add_set = kset->set;
+ bool vmlinux_set = !btf_is_module(btf);
+ bool add_filter = !!kset->filter;
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set8 *set;
+ u32 set_cnt;
+ int ret;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+
+ if (tab && add_filter) {
+ u32 i;
+
+ hook_filter = &tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i] == kset->filter) {
+ add_filter = false;
+ break;
+ }
+ }
+
+ if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+ }
+
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* We don't need to allocate, concatenate, and sort module sets, because
+ * only one is allowed per hook. Hence, we can directly assign the
+ * pointer and return.
+ */
+ if (!vmlinux_set) {
+ tab->sets[hook] = add_set;
+ goto do_add_filter;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set8_contains function work.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook],
+ offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook])
+ set->cnt = 0;
+ tab->sets[hook] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ set->cnt += add_set->cnt;
+
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+
+do_add_filter:
+ if (add_filter) {
+ hook_filter = &tab->hook_filters[hook];
+ hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
+ }
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *set;
+ u32 *id, i;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
+ if (!btf->kfunc_set_tab)
+ return NULL;
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return NULL;
+ }
+ set = btf->kfunc_set_tab->sets[hook];
+ if (!set)
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ return BTF_KFUNC_HOOK_TRACING;
+ case BPF_PROG_TYPE_SYSCALL:
+ return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
+
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
+ if (kfunc_flags)
+ return kfunc_flags;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
+}
+
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf *btf;
+ int ret, i;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf) {
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register kfuncs\n");
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+
+err_out:
+ btf_put(btf);
+ return ret;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+ struct btf_id_dtor_kfunc *dtor;
+
+ if (!tab)
+ return -ENOENT;
+ /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+ * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+ */
+ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+ dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+ if (!dtor)
+ return -ENOENT;
+ return dtor->kfunc_btf_id;
+}
+
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+ const struct btf_type *dtor_func, *dtor_func_proto, *t;
+ const struct btf_param *args;
+ s32 dtor_btf_id;
+ u32 nr_args, i;
+
+ for (i = 0; i < cnt; i++) {
+ dtor_btf_id = dtors[i].kfunc_btf_id;
+
+ dtor_func = btf_type_by_id(btf, dtor_btf_id);
+ if (!dtor_func || !btf_type_is_func(dtor_func))
+ return -EINVAL;
+
+ dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+ if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+ return -EINVAL;
+
+ /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+ t = btf_type_by_id(btf, dtor_func_proto->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+
+ nr_args = btf_type_vlen(dtor_func_proto);
+ if (nr_args != 1)
+ return -EINVAL;
+ args = btf_params(dtor_func_proto);
+ t = btf_type_by_id(btf, args[0].type);
+ /* Allow any pointer type, as width on targets Linux supports
+ * will be same for all pointer types (i.e. sizeof(void *))
+ */
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+ struct module *owner)
+{
+ struct btf_id_dtor_kfunc_tab *tab;
+ struct btf *btf;
+ u32 tab_cnt;
+ int ret;
+
+ btf = btf_get_module_btf(owner);
+ if (!btf) {
+ if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Ensure that the prototype of dtor kfuncs being registered is sane */
+ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+ if (ret < 0)
+ goto end;
+
+ tab = btf->dtor_kfunc_tab;
+ /* Only one call allowed for modules */
+ if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ tab_cnt = tab ? tab->cnt : 0;
+ if (tab_cnt > U32_MAX - add_cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+ if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ tab = krealloc(btf->dtor_kfunc_tab,
+ offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ if (!btf->dtor_kfunc_tab)
+ tab->cnt = 0;
+ btf->dtor_kfunc_tab = tab;
+
+ memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+ tab->cnt += add_cnt;
+
+ sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+end:
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs/ENUM64s, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
+}
+
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_calc_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id, const char *suffix)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member;
+ u32 i;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
+ if (ret >= sizeof(safe_tname))
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+
+ btf_type_skip_modifiers(btf, mtype->type, &id);
+ /* If we match on both type and name, the field is considered trusted. */
+ if (btf_id == id && !strcmp(field_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* Either of the two names must be the other name, suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac53102e244a..491d20038cbe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,14 +14,181 @@
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ const struct bpf_prog_array *array;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ u32 func_ret;
+
+ run_ctx.retval = retval;
+ migrate_disable();
+ rcu_read_lock();
+ array = rcu_dereference(cgrp->effective[atype]);
+ item = &array->items[0];
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ while ((prog = READ_ONCE(item->prog))) {
+ run_ctx.prog_item = item;
+ func_ret = run_prog(prog, ctx);
+ if (ret_flags) {
+ *(ret_flags) |= (func_ret >> 1);
+ func_ret &= 1;
+ }
+ if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ run_ctx.retval = -EPERM;
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+ return run_ctx.retval;
+}
+
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ cgroup_lock();
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ cgroup_unlock();
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
@@ -37,17 +204,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
}
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
- struct bpf_prog *prog)
+ struct bpf_cgroup_storage *new_storages[],
+ enum bpf_attach_type type,
+ struct bpf_prog *prog,
+ struct cgroup *cgrp)
{
enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_map *map;
+
+ key.cgroup_inode_id = cgroup_id(cgrp);
+ key.attach_type = type;
for_each_cgroup_storage_type(stype) {
+ map = prog->aux->cgroup_storage[stype];
+ if (!map)
+ continue;
+
+ storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+ if (storages[stype])
+ continue;
+
storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storages[stype])) {
- storages[stype] = NULL;
- bpf_cgroup_storages_free(storages);
+ bpf_cgroup_storages_free(new_storages);
return -ENOMEM;
}
+
+ new_storages[stype] = storages[stype];
}
return 0;
@@ -63,7 +247,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
}
static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
- struct cgroup* cgrp,
+ struct cgroup *cgrp,
enum bpf_attach_type attach_type)
{
enum bpf_cgroup_storage_type stype;
@@ -72,14 +256,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}
-static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
-{
- enum bpf_cgroup_storage_type stype;
-
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_unlink(storages[stype]);
-}
-
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
* doesn't free link memory, which will eventually be done by bpf_link's
@@ -101,32 +277,45 @@ static void cgroup_bpf_release(struct work_struct *work)
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
bpf.release_work);
struct bpf_prog_array *old_array;
- unsigned int type;
+ struct list_head *storages = &cgrp->bpf.storages;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ unsigned int atype;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
- struct list_head *progs = &cgrp->bpf.progs[type];
- struct bpf_prog_list *pl, *tmp;
+ for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
- list_for_each_entry_safe(pl, tmp, progs, node) {
- list_del(&pl->node);
- if (pl->prog)
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
- if (pl->link)
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_free(pl->storage);
+ }
kfree(pl);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
- cgrp->bpf.effective[type],
+ cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
bpf_prog_array_free(old_array);
}
- mutex_unlock(&cgroup_mutex);
+ list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
+ cgroup_unlock();
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -163,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -181,7 +370,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *p;
@@ -189,12 +378,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (!p)
return true;
do {
- u32 flags = p->bpf.flags[type];
+ u32 flags = p->bpf.flags[atype];
u32 cnt;
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[type]);
+ cnt = prog_list_length(&p->bpf.progs[atype]);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -210,7 +399,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
* to programs in this cgroup
*/
static int compute_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array **array)
{
struct bpf_prog_array_item *item;
@@ -221,8 +410,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* count number of effective programs by walking parents */
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[type]);
+ if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[atype]);
p = cgroup_parent(p);
} while (p);
@@ -234,10 +423,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[type], node) {
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
@@ -254,10 +443,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
static void activate_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array *old_array)
{
- old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
@@ -288,7 +477,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cgroup_bpf_get(p);
for (i = 0; i < NR; i++)
- INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
+
+ INIT_LIST_HEAD(&cgrp->bpf.storages);
for (i = 0; i < NR; i++)
if (compute_effective_progs(cgrp, i, &arrays[i]))
@@ -311,7 +502,7 @@ cleanup:
}
static int update_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
int err;
@@ -323,7 +514,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt))
continue;
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
if (err)
goto cleanup;
}
@@ -340,7 +531,7 @@ static int update_effective_progs(struct cgroup *cgrp,
continue;
}
- activate_effective_progs(desc, type, desc->bpf.inactive);
+ activate_effective_progs(desc, atype, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -362,7 +553,7 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -372,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* single-attach case */
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
return NULL;
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -388,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -413,17 +604,19 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
- struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
- struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
+ struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -437,10 +630,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type))
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -455,66 +654,99 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (IS_ERR(pl))
return PTR_ERR(pl);
- if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
+ if (bpf_cgroup_storages_alloc(storage, new_storage, type,
+ prog ? : link->link.prog, cgrp))
return -ENOMEM;
if (pl) {
old_prog = pl->prog;
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_assign(old_storage, pl->storage);
} else {
+ struct hlist_node *last = NULL;
+
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
- bpf_cgroup_storages_free(storage);
+ bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- list_add_tail(&pl->node, progs);
+ if (hlist_empty(progs))
+ hlist_add_head(&pl->node, progs);
+ else
+ hlist_for_each(last, progs) {
+ if (last->next)
+ continue;
+ hlist_add_behind(&pl->node, last);
+ break;
+ }
}
pl->prog = prog;
pl->link = link;
bpf_cgroup_storages_assign(pl->storage, storage);
- cgrp->bpf.flags[type] = saved_flags;
+ cgrp->bpf.flags[atype] = saved_flags;
- err = update_effective_progs(cgrp, type);
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+ if (err)
+ goto cleanup;
+ }
+
+ err = update_effective_progs(cgrp, atype);
if (err)
- goto cleanup;
+ goto cleanup_trampoline;
- bpf_cgroup_storages_free(old_storage);
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- else
- static_branch_inc(&cgroup_bpf_enabled_key);
- bpf_cgroup_storages_link(pl->storage, cgrp, type);
+ } else {
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
+ bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
cleanup:
if (old_prog) {
pl->prog = old_prog;
pl->link = NULL;
}
- bpf_cgroup_storages_free(pl->storage);
- bpf_cgroup_storages_assign(pl->storage, old_storage);
- bpf_cgroup_storages_link(pl->storage, cgrp, type);
+ bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
- list_del(&pl->node);
+ hlist_del(&pl->node);
kfree(pl);
}
return err;
}
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ cgroup_unlock();
+ return ret;
+}
+
/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
static void replace_effective_prog(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_cgroup_link *link)
{
struct bpf_prog_array_item *item;
struct cgroup_subsys_state *css;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct list_head *head;
+ struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -526,11 +758,11 @@ static void replace_effective_prog(struct cgroup *cgrp,
/* find position of link in effective progs array */
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
- if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- head = &cg->bpf.progs[type];
- list_for_each_entry(pl, head, node) {
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->link == link)
@@ -541,7 +773,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
found:
BUG_ON(!cg);
progs = rcu_dereference_protected(
- desc->bpf.effective[type],
+ desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
item = &progs->items[pos];
WRITE_ONCE(item->prog, link->link.prog);
@@ -553,7 +785,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
- * @type: Type of attach operation
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ * incremented
*
* Must be called with cgroup_mutex held.
*/
@@ -561,15 +794,22 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
struct bpf_cgroup_link *link,
struct bpf_prog *new_prog)
{
- struct list_head *progs = &cgrp->bpf.progs[link->type];
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
+ struct hlist_head *progs;
bool found = false;
+ atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -579,7 +819,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
return -ENOENT;
old_prog = xchg(&link->link.prog, new_prog);
- replace_effective_prog(cgrp, link->type, link);
+ replace_effective_prog(cgrp, atype, link);
bpf_prog_put(old_prog);
return 0;
}
@@ -592,7 +832,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
cg_link = container_of(link, struct bpf_cgroup_link, link);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* link might have been auto-released by dying cgroup, so fail */
if (!cg_link->cgroup) {
ret = -ENOLINK;
@@ -604,11 +844,11 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
}
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -616,14 +856,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog_list *pl;
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* report error when trying to detach and nothing is attached */
return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -633,7 +873,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -641,24 +881,93 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
}
/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
+found:
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
- * @prog: A link to detach or NULL
+ * @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
- struct bpf_prog_list *pl;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
- int err;
+ struct bpf_prog_list *pl;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
+ u32 flags;
+
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
if (prog && link)
/* only one of prog or link can be specified */
@@ -673,81 +982,151 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, type);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
/* now can actually delete it from this cgroup list */
- list_del(&pl->node);
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_free(pl->storage);
+ hlist_del(&pl->node);
+
kfree(pl);
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[type] = 0;
- if (old_prog)
+ cgrp->bpf.flags[atype] = 0;
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
+}
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ cgroup_unlock();
+ return ret;
}
/* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
+ enum cgroup_bpf_attach_type from_atype, to_atype;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
- struct bpf_prog *prog;
int cnt, ret = 0, i;
+ int total_cnt = 0;
+ u32 flags;
- effective = rcu_dereference_protected(cgrp->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
+ if (effective_query && prog_attach_flags)
+ return -EINVAL;
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(effective);
- else
- cnt = prog_list_length(progs);
+ if (type == BPF_LSM_CGROUP) {
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
+ return -EINVAL;
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
+
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ }
+ }
+
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
return -EFAULT;
- if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
- if (attr->query.prog_cnt < cnt) {
- cnt = attr->query.prog_cnt;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
ret = -ENOSPC;
}
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
- } else {
- struct bpf_prog_list *pl;
- u32 id;
-
- i = 0;
- list_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id)))
- return -EFAULT;
- if (++i == cnt)
- break;
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
}
return ret;
}
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_unlock();
+ return ret;
+}
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@@ -803,6 +1182,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
{
struct bpf_cgroup_link *cg_link =
container_of(link, struct bpf_cgroup_link, link);
+ struct cgroup *cg;
/* link might have been auto-detached by dying cgroup already,
* in that case our work is done here
@@ -810,19 +1190,25 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
if (!cg_link->cgroup)
return;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* re-check cgroup under lock again */
if (!cg_link->cgroup) {
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return;
}
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
cg_link->type));
+ if (cg_link->type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
+
+ cg = cg_link->cgroup;
+ cg_link->cgroup = NULL;
- mutex_unlock(&cgroup_mutex);
- cgroup_put(cg_link->cgroup);
+ cgroup_unlock();
+
+ cgroup_put(cg);
}
static void bpf_cgroup_link_dealloc(struct bpf_link *link)
@@ -833,6 +1219,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
+static int bpf_cgroup_link_detach(struct bpf_link *link)
+{
+ bpf_cgroup_link_release(link);
+
+ return 0;
+}
+
static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
@@ -840,10 +1233,10 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
seq_printf(seq,
"cgroup_id:\t%llu\n"
@@ -859,10 +1252,10 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
info->cgroup.cgroup_id = cg_id;
info->cgroup.attach_type = cg_link->type;
@@ -872,6 +1265,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .detach = bpf_cgroup_link_detach,
.update_prog = cgroup_bpf_replace,
.show_fdinfo = bpf_cgroup_link_show_fdinfo,
.fill_link_info = bpf_cgroup_link_fill_link_info,
@@ -901,14 +1295,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto out_put_cgroup;
}
- err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
- BPF_F_ALLOW_MULTI);
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+ link->type, BPF_F_ALLOW_MULTI);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
@@ -941,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -954,7 +1348,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -962,7 +1356,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
unsigned int offset = skb->data - skb_network_header(skb);
struct sock *save_sk;
@@ -984,13 +1378,41 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- if (type == BPF_CGROUP_INET_EGRESS) {
- ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ if (atype == CGROUP_INET_EGRESS) {
+ u32 flags = 0;
+ bool cn;
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+ __bpf_prog_run_save_cb, 0, &flags);
+
+ /* Return values of CGROUP EGRESS BPF programs are:
+ * 0: drop packet
+ * 1: keep packet
+ * 2: drop packet and cn
+ * 3: keep packet and cn
+ *
+ * The returned value is then converted to one of the NET_XMIT
+ * or an error code that is then interpreted as drop packet
+ * (and no cn):
+ * 0: NET_XMIT_SUCCESS skb should be transmitted
+ * 1: NET_XMIT_DROP skb should be dropped and cn
+ * 2: NET_XMIT_CN skb should be transmitted and cn
+ * 3: -err skb should be dropped
+ */
+
+ cn = flags & BPF_RET_SET_CN;
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ if (!ret)
+ ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+ else
+ ret = (cn ? NET_XMIT_DROP : ret);
} else {
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+ skb, __bpf_prog_run_save_cb, 0,
+ NULL);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1003,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1014,13 +1436,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sk(struct sock *sk,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+ NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1029,18 +1450,25 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ * uaddr.
+ * @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
+ * @flags: Pointer to u32 which contains higher bits of BPF program
+ * return value (OR'ed together).
*
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
*
* This function will return %-EPERM if an attached program is found and
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
- enum bpf_attach_type type,
- void *t_ctx)
+ int *uaddrlen,
+ enum cgroup_bpf_attach_type atype,
+ void *t_ctx,
+ u32 *flags)
{
struct bpf_sock_addr_kern ctx = {
.sk = sk,
@@ -1054,18 +1482,26 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
*/
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+ sk->sk_family != AF_UNIX)
return 0;
if (!ctx.uaddr) {
memset(&unspec, 0, sizeof(unspec));
ctx.uaddr = (struct sockaddr *)&unspec;
+ ctx.uaddrlen = 0;
+ } else {
+ ctx.uaddrlen = *uaddrlen;
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
+
+ if (!ret && uaddr)
+ *uaddrlen = ctx.uaddrlen;
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1075,7 +1511,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1087,19 +1523,17 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
*/
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
struct bpf_sock_ops_kern *sock_ops,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
- BPF_PROG_RUN);
- return ret == 1 ? 0 : -EPERM;
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+ 0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
- short access, enum bpf_attach_type type)
+ short access, enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp;
struct bpf_cgroup_dev_ctx ctx = {
@@ -1107,27 +1541,92 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow = 1;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
- return !allow;
+ return ret;
}
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
+}
+
+const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
@@ -1135,12 +1634,6 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -1192,7 +1685,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @type: type of program to be executed
+ * @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.
@@ -1202,8 +1695,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void **buf, size_t *pcount, loff_t *ppos,
- enum bpf_attach_type type)
+ char **buf, size_t *pcount, loff_t *ppos,
+ enum cgroup_bpf_attach_type atype)
{
struct bpf_sysctl_kern ctx = {
.head = head,
@@ -1243,7 +1736,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1256,25 +1750,12 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+ struct bpf_sockopt_buf *buf)
{
if (unlikely(max_optlen < 0))
return -EINVAL;
@@ -1286,6 +1767,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
max_optlen = PAGE_SIZE;
}
+ if (max_optlen <= sizeof(buf->data)) {
+ /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+ * bytes avoid the cost of kzalloc.
+ */
+ ctx->optval = buf->data;
+ ctx->optval_end = ctx->optval + max_optlen;
+ return max_optlen;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
@@ -1295,16 +1785,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
return max_optlen;
}
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
{
+ if (ctx->optval == buf->data)
+ return;
kfree(ctx->optval);
}
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
+{
+ return ctx->optval != buf->data;
+}
+
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
- int *optname, char __user *optval,
+ int *optname, sockptr_t optval,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = *level,
@@ -1312,46 +1812,42 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(*optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
ret = 1;
} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
+ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = 0;
+ goto out;
+ }
ret = -EFAULT;
} else {
/* optlen within bounds, run kernel handler */
@@ -1366,41 +1862,53 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
if (ctx.optlen != 0) {
*optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+ /* We've used bpf_sockopt_kern->buf as an intermediary
+ * storage, but the BPF program indicates that we need
+ * to pass this data to the kernel setsockopt handler.
+ * No way to export on-stack buf, have to allocate a
+ * new buffer.
+ */
+ if (!sockopt_buf_allocated(&ctx, &buf)) {
+ void *p = kmalloc(ctx.optlen, GFP_USER);
+
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(p, ctx.optval, ctx.optlen);
+ *kernel_optval = p;
+ } else {
+ *kernel_optval = ctx.optval;
+ }
+ /* export and don't free sockopt buf */
+ return 0;
}
}
out:
- if (ret)
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval,
- int __user *optlen, int max_optlen,
+ int optname, sockptr_t optval,
+ sockptr_t optlen, int max_optlen,
int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
+ int orig_optlen;
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
- return retval;
-
+ orig_optlen = max_optlen;
ctx.optlen = max_optlen;
-
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1411,54 +1919,99 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* one that kernel returned as well to let
* BPF programs inspect the value.
*/
+ if (copy_from_sockptr(&ctx.optlen, optlen,
+ sizeof(ctx.optlen))) {
+ ret = -EFAULT;
+ goto out;
+ }
- if (get_user(ctx.optlen, optlen)) {
+ if (ctx.optlen < 0) {
ret = -EFAULT;
goto out;
}
+ orig_optlen = ctx.optlen;
- if (copy_from_user(ctx.optval, optval,
- min(ctx.optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(ctx.optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
- goto out;
- }
-
- if (ctx.optlen > max_optlen) {
- ret = -EFAULT;
+ if (ret < 0)
goto out;
- }
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
+ if (!sockptr_is_null(optval) &&
+ (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = retval;
+ goto out;
+ }
ret = -EFAULT;
goto out;
}
if (ctx.optlen != 0) {
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
+ if (!sockptr_is_null(optval) &&
+ copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
ret = -EFAULT;
goto out;
}
}
- ret = ctx.retval;
-
out:
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
+ return ret;
+}
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .optlen = *optlen,
+ .optval = optval,
+ .optval_end = optval + *optlen,
+ .current_task = current,
+ };
+ int ret;
+
+ /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+ * user data back into BPF buffer when reval != 0. This is
+ * done as an optimization to avoid extra copy, assuming
+ * kernel won't populate the data in case of an error.
+ * Here we always pass the data and memset() should
+ * be called if that data shouldn't be "exported".
+ */
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.optlen > *optlen)
+ return -EFAULT;
+
+ /* BPF programs can shrink the buffer, export the modifications.
+ */
+ if (ctx.optlen != 0)
+ *optlen = ctx.optlen;
+
return ret;
}
#endif
@@ -1607,18 +2160,24 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -1627,8 +2186,12 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_get_new_value_proto;
case BPF_FUNC_sysctl_set_new_value:
return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -1696,10 +2259,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
- *insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg,
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
bpf_ctx_narrow_access_offset(
- 0, sizeof(u32), sizeof(loff_t)));
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1730,22 +2295,60 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+ const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+ .func = bpf_get_netns_cookie_sockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sockopt_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -1770,7 +2373,7 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return prog->expected_attach_type ==
BPF_CGROUP_GETSOCKOPT;
case offsetof(struct bpf_sockopt, optname):
- /* fallthrough */
+ fallthrough;
case offsetof(struct bpf_sockopt, level):
if (size != size_default)
return false;
@@ -1811,10 +2414,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return true;
}
-#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
- T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sockopt_kern, F))
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -1826,37 +2436,68 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sockopt, sk):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
break;
case offsetof(struct bpf_sockopt, level):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
break;
case offsetof(struct bpf_sockopt, optname):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
break;
case offsetof(struct bpf_sockopt, optlen):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
break;
case offsetof(struct bpf_sockopt, optval_end):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
}
@@ -1881,3 +2522,71 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..f04a468cf6a7
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_lock();
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_unlock();
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
+ p->start_css = &cgrp->self;
+ css_get(p->start_css);
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+ struct cgroup_subsys_state *start;
+ struct cgroup_subsys_state *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start, unsigned int flags)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+ kit->start = NULL;
+ switch (flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->start = start;
+ kit->pos = NULL;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ if (!kit->start)
+ return NULL;
+
+ switch (kit->flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ kit->pos = css_next_descendant_post(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ kit->pos = kit->pos ? kit->pos->parent : kit->start;
+ }
+
+ return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9df4cc9a2907..ea6843be2616 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -25,13 +25,20 @@
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/btf.h>
-#include <linux/frame.h>
+#include <linux/objtool.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
+#include <linux/nospec.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -54,8 +61,12 @@
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
+#define OFF insn->off
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -64,11 +75,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -77,7 +90,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@@ -86,25 +99,40 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ if (!fp->active) {
+ vfree(fp);
+ kfree(aux);
+ return NULL;
+ }
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
+ mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->dst_mutex);
return fp;
}
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;
@@ -112,8 +140,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (!prog)
return NULL;
- prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
- if (!prog->aux->stats) {
+ prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->stats) {
+ free_percpu(prog->active);
kfree(prog->aux);
vfree(prog);
return NULL;
@@ -122,7 +151,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
for_each_possible_cpu(cpu) {
struct bpf_prog_stats *pstats;
- pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ pstats = per_cpu_ptr(prog->stats, cpu);
u64_stats_init(&pstats->syncp);
}
return prog;
@@ -134,25 +163,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
if (!prog->aux->nr_linfo || !prog->jit_requested)
return 0;
- prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
- sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL | __GFP_NOWARN);
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;
return 0;
}
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
- kfree(prog->aux->jited_linfo);
- prog->aux->jited_linfo = NULL;
-}
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
- if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
- bpf_prog_free_jited_linfo(prog);
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
}
/* The jit engine is responsible to provide an array
@@ -162,7 +191,7 @@ void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
* here is relative to the prog itself instead of the main prog.
* This array has one entry for each xlated bpf insn.
*
- * jited_off is the byte off to the last byte of the jited insn.
+ * jited_off is the byte off to the end of the jited insn.
*
* Hence, with
* insn_start:
@@ -186,7 +215,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const struct bpf_line_info *linfo;
void **jited_linfo;
- if (!prog->aux->jited_linfo)
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
/* Userspace did not provide linfo */
return;
@@ -208,34 +237,20 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
- bpf_prog_free_jited_linfo(prog);
- kvfree(prog->aux->linfo);
-}
-
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
- u32 pages, delta;
- int ret;
+ u32 pages;
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
return fp_old;
- delta = pages - fp_old->pages;
- ret = __bpf_prog_charge(fp_old->aux->user, delta);
- if (ret)
- return NULL;
-
fp = __vmalloc(size, gfp_flags);
- if (fp == NULL) {
- __bpf_prog_uncharge(fp_old->aux->user, delta);
- } else {
+ if (fp) {
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@@ -244,6 +259,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
* reallocated structure.
*/
fp_old->aux = NULL;
+ fp_old->stats = NULL;
+ fp_old->active = NULL;
__bpf_prog_free(fp_old);
}
@@ -253,10 +270,13 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
if (fp->aux) {
- free_percpu(fp->aux->stats);
+ mutex_destroy(&fp->aux->used_maps_mutex);
+ mutex_destroy(&fp->aux->dst_mutex);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
+ free_percpu(fp->stats);
+ free_percpu(fp->active);
vfree(fp);
}
@@ -354,9 +374,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
s32 end_new, s32 curr, const bool probe_pass)
{
- const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s64 off_min, off_max, off;
s32 delta = end_new - end_old;
- s32 off = insn->off;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ off = insn->imm;
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
+ off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
@@ -364,8 +393,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
- if (!probe_pass)
- insn->off = off;
+ if (!probe_pass) {
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ insn->imm = off;
+ else
+ insn->off = off;
+ }
return 0;
}
@@ -387,6 +420,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@@ -506,7 +546,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
- for (i = 0; i < fp->aux->func_cnt; i++)
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
bpf_prog_kallsyms_del(fp->aux->func[i]);
}
@@ -522,17 +562,15 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -558,7 +596,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
/* prog->aux->name will be ignored if full btf name is available */
- if (prog->aux->func_info_cnt) {
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
type = btf_type_by_id(prog->aux->btf,
prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -592,7 +630,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
if (val < ksym->start)
return -1;
- if (val >= ksym->end)
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
return 1;
return 0;
@@ -637,12 +679,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
return fp->jited && !bpf_prog_was_classic(fp);
}
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
- return list_empty(&fp->aux->ksym.lnode) ||
- fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
@@ -654,6 +690,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -662,6 +715,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
return;
bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
}
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
@@ -708,7 +766,7 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
struct bpf_ksym *ksym = bpf_ksym_find(addr);
@@ -773,7 +831,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
if (size > poke_tab_max)
return -ENOSPC;
- if (poke->ip || poke->ip_stable || poke->adj_off)
+ if (poke->tailcall_target || poke->tailcall_target_stable ||
+ poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
return -EINVAL;
switch (poke->reason) {
@@ -796,6 +855,147 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+#else
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
+#endif
+
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ return pack;
+}
+
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = bpf_jit_alloc_exec(size);
+ if (ptr) {
+ bpf_fill_ill_insns(ptr, size);
+ set_vm_flush_reset_perms(ptr);
+ set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack(bpf_fill_ill_insns);
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+void bpf_prog_pack_free(void *ptr, u32 size)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
+ goto out;
+ }
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -814,18 +1014,18 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
pure_initcall(bpf_jit_charge_init);
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
- atomic_long_sub(pages, &bpf_jit_current);
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
+ if (!bpf_capable()) {
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -833,9 +1033,9 @@ static int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -854,7 +1054,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -864,23 +1064,22 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -890,10 +1089,121 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_prog_pack_free(ro_header, size);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = get_random_u32_below(hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header, ro_header->size);
+ return PTR_ERR(ptr);
+ }
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header, size);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & PAGE_MASK;
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -906,7 +1216,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_free(hdr);
-
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -920,6 +1229,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
s16 off = insn->off;
s32 imm = insn->imm;
u8 *addr;
+ int err;
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
if (!*func_addr_fixed) {
@@ -930,10 +1240,15 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
if (!extra_pass)
addr = NULL;
else if (prog->aux->func &&
- off >= 0 && off < prog->aux->func_cnt)
+ off >= 0 && off < prog->aux->real_func_cnt)
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
} else {
/* Address of a BPF helper call. Since part of the core
* kernel, it's always at a fixed location. __bpf_call_base
@@ -953,7 +1268,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -998,7 +1313,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU | BPF_MOD | BPF_K:
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_ALU64 | BPF_ADD | BPF_K:
@@ -1012,7 +1327,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU64 | BPF_MOD | BPF_K:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_JMP | BPF_JEQ | BPF_K:
@@ -1111,6 +1426,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
* clone is guaranteed to not be locked.
*/
fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
__bpf_prog_free(fp);
}
@@ -1131,7 +1448,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1142,6 +1459,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn = clone->insnsi;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* ld_imm64 with an address of bpf subprog is not
+ * a user controlled constant. Don't randomize it,
+ * since it will conflict with jit_subprogs() logic.
+ */
+ insn++;
+ i++;
+ continue;
+ }
+
/* We temporarily need to hold the original ld64 insn
* so that we can still access the first part in the
* second blinding run.
@@ -1238,6 +1565,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU64, DIV, X), \
INSN_3(ALU64, MOD, X), \
INSN_2(ALU64, NEG), \
+ INSN_3(ALU64, END, TO_LE), \
/* Immediate based. */ \
INSN_3(ALU64, ADD, K), \
INSN_3(ALU64, SUB, K), \
@@ -1306,14 +1634,15 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSLE, K), \
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
+ INSN_2(JMP32, JA), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, XADD, W), \
- INSN_3(STX, XADD, DW), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -1325,6 +1654,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, H), \
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
+ INSN_3(LDX, MEMSX, B), \
+ INSN_3(LDX, MEMSX, H), \
+ INSN_3(LDX, MEMSX, W), \
/* Immediate based. */ \
INSN_3(LD, IMM, DW)
@@ -1350,21 +1682,16 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
-{
- memset(dst, 0, size);
- return -EFAULT;
-}
-
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
- * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
-static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
@@ -1375,10 +1702,14 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
@@ -1390,29 +1721,54 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
ALU_NEG:
DST = (u32) -DST;
@@ -1421,13 +1777,36 @@ select_insn:
DST = -DST;
CONT;
ALU_MOV_X:
- DST = (u32) SRC;
+ switch (OFF) {
+ case 0:
+ DST = (u32) SRC;
+ break;
+ case 8:
+ DST = (u32)(s8) SRC;
+ break;
+ case 16:
+ DST = (u32)(s16) SRC;
+ break;
+ }
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
- DST = SRC;
+ switch (OFF) {
+ case 0:
+ DST = SRC;
+ break;
+ case 8:
+ DST = (s8) SRC;
+ break;
+ case 16:
+ DST = (s16) SRC;
+ break;
+ case 32:
+ DST = (s32) SRC;
+ break;
+ }
CONT;
ALU64_MOV_K:
DST = IMM;
@@ -1437,48 +1816,126 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) (((s32) DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- div64_u64_rem(DST, SRC, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, SRC);
+ DST = DST - AX * SRC;
+ break;
+ }
CONT;
ALU_MOD_X:
- AX = (u32) DST;
- DST = do_div(AX, (u32) SRC);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)SRC));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, IMM);
+ DST = DST - AX * IMM;
+ break;
+ }
CONT;
ALU_MOD_K:
- AX = (u32) DST;
- DST = do_div(AX, (u32) IMM);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)IMM));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_DIV_X:
- DST = div64_u64(DST, SRC);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, SRC);
+ break;
+ case 1:
+ DST = div64_s64(DST, SRC);
+ break;
+ }
CONT;
ALU_DIV_X:
- AX = (u32) DST;
- do_div(AX, (u32) SRC);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)SRC));
+ if (((s32)DST < 0) == ((s32)SRC < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU64_DIV_K:
- DST = div64_u64(DST, IMM);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, IMM);
+ break;
+ case 1:
+ DST = div64_s64(DST, IMM);
+ break;
+ }
CONT;
ALU_DIV_K:
- AX = (u32) DST;
- do_div(AX, (u32) IMM);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)IMM));
+ if (((s32)DST < 0) == ((s32)IMM < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1506,6 +1963,19 @@ select_insn:
break;
}
CONT;
+ ALU64_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) __swab16(DST);
+ break;
+ case 32:
+ DST = (__force u32) __swab32(DST);
+ break;
+ case 64:
+ DST = (__force u64) __swab64(DST);
+ break;
+ }
+ CONT;
/* CALL */
JMP_CALL:
@@ -1532,7 +2002,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1554,6 +2025,9 @@ out:
JMP_JA:
insn += insn->off;
CONT;
+ JMP32_JA:
+ insn += insn->imm;
+ CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
@@ -1594,7 +2068,19 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+ barrier_nospec();
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1604,6 +2090,11 @@ out:
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
LDST(B, u8)
@@ -1611,23 +2102,75 @@ out:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
-#define LDX_PROBE(SIZEOP, SIZE) \
- LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
- CONT;
- LDX_PROBE(B, 1)
- LDX_PROBE(H, 2)
- LDX_PROBE(W, 4)
- LDX_PROBE(DW, 8)
-#undef LDX_PROBE
-
- STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
- atomic_add((u32) SRC, (atomic_t *)(unsigned long)
- (DST + insn->off));
+
+#define LDSX(SIZEOP, SIZE) \
+ LDX_MEMSX_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEMSX_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
- STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
- atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
- (DST + insn->off));
+
+ LDSX(B, s8)
+ LDSX(H, s16)
+ LDSX(W, s32)
+#undef LDSX
+
+#define ATOMIC_ALU_OP(BOP, KOP) \
+ case BOP: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
+ (DST + insn->off)); \
+ else \
+ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
+ (DST + insn->off)); \
+ break; \
+ case BOP | BPF_FETCH: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ SRC = (u32) atomic_fetch_##KOP( \
+ (u32) SRC, \
+ (atomic_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ SRC = (u64) atomic64_fetch_##KOP( \
+ (u64) SRC, \
+ (atomic64_t *)(unsigned long) (DST + insn->off)); \
+ break;
+
+ STX_ATOMIC_DW:
+ STX_ATOMIC_W:
+ switch (IMM) {
+ ATOMIC_ALU_OP(BPF_ADD, add)
+ ATOMIC_ALU_OP(BPF_AND, and)
+ ATOMIC_ALU_OP(BPF_OR, or)
+ ATOMIC_ALU_OP(BPF_XOR, xor)
+#undef ATOMIC_ALU_OP
+
+ case BPF_XCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ SRC = (u32) atomic_xchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) SRC);
+ else
+ SRC = (u64) atomic64_xchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) SRC);
+ break;
+ case BPF_CMPXCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ BPF_R0 = (u32) atomic_cmpxchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) BPF_R0, (u32) SRC);
+ else
+ BPF_R0 = (u64) atomic64_cmpxchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) BPF_R0, (u64) SRC);
+ break;
+
+ default:
+ goto default_label;
+ }
CONT;
default_label:
@@ -1637,7 +2180,8 @@ out:
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
- pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
+ insn->code, insn->imm);
BUG_ON(1);
return 0;
}
@@ -1647,11 +2191,11 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
@@ -1668,7 +2212,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
BPF_R3 = r3; \
BPF_R4 = r4; \
BPF_R5 = r5; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define EVAL1(FN, X) FN(X)
@@ -1696,14 +2240,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
-static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
- const struct bpf_insn *insn) = {
+static __maybe_unused
+u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
+#ifdef CONFIG_BPF_SYSCALL
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
stack_depth = max_t(u32, stack_depth, 1);
@@ -1712,7 +2258,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
__bpf_call_base_args;
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
-
+#endif
#else
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
@@ -1725,43 +2271,63 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
}
#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ spin_lock(&map->owner.lock);
+ if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ map->owner.type = prog_type;
+ map->owner.jited = fp->jited;
+ map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ ret = true;
+ } else {
+ ret = map->owner.type == prog_type &&
+ map->owner.jited == fp->jited &&
+ map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
+ spin_unlock(&map->owner.lock);
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- int i;
+ int i, ret = 0;
+ mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp))
- return -EINVAL;
+ if (!bpf_prog_map_compatible(map, fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- return 0;
+out:
+ mutex_unlock(&aux->used_maps_mutex);
+ return ret;
}
static void bpf_prog_select_func(struct bpf_prog *fp)
@@ -1777,20 +2343,29 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
+ bool jit_needed = false;
+
if (fp->bpf_func)
goto finalize;
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
@@ -1799,20 +2374,16 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
fp = bpf_int_jit_compile(fp);
- if (!fp->jited) {
- bpf_prog_free_jited_linfo(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
return fp;
-#endif
- } else {
- bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
@@ -1848,18 +2419,10 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
@@ -1869,16 +2432,37 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
(prog_cnt + 1),
flags);
- return &empty_prog_array.hdr;
+ return &bpf_empty_prog_array.hdr;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
@@ -1958,16 +2542,71 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
}
}
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ * index into the program array with
+ * a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to replace.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+ return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ * into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array_item *item;
+
+ if (unlikely(index < 0))
+ return -EINVAL;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
+ continue;
+ if (!index) {
+ WRITE_ONCE(item->prog, prog);
+ return 0;
+ }
+ index--;
+ }
+ return -ENOENT;
+}
+
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
+ u64 bpf_cookie,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog_array_item *existing;
+ struct bpf_prog_array_item *existing, *new;
struct bpf_prog_array *array;
bool found_exclude = false;
- int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
* the new array.
@@ -2004,20 +2643,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
if (!array)
return -ENOMEM;
+ new = array->items;
/* Fill in the new prog array */
if (carry_prog_cnt) {
existing = old_array->items;
- for (; existing->prog; existing++)
- if (existing->prog != exclude_prog &&
- existing->prog != &dummy_bpf_prog.prog) {
- array->items[new_prog_idx++].prog =
- existing->prog;
- }
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog ||
+ existing->prog == &dummy_bpf_prog.prog)
+ continue;
+
+ new->prog = existing->prog;
+ new->bpf_cookie = existing->bpf_cookie;
+ new++;
+ }
}
- if (include_prog)
- array->items[new_prog_idx++].prog = include_prog;
- array->items[new_prog_idx].prog = NULL;
+ if (include_prog) {
+ new->prog = include_prog;
+ new->bpf_cookie = bpf_cookie;
+ new++;
+ }
+ new->prog = NULL;
*new_array = array;
return 0;
}
@@ -2042,28 +2688,20 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}
-static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
-{
- enum bpf_cgroup_storage_type stype;
-
- for_each_cgroup_storage_type(stype) {
- if (!aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
- }
-}
-
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
+ bool sleepable;
u32 i;
- bpf_free_cgroup_storage(aux);
+ sleepable = aux->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
bpf_map_put(map);
}
}
@@ -2074,23 +2712,60 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
+ struct btf_mod_pair *used_btfs, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct btf_mod_pair *btf_mod;
+ u32 i;
+
+ for (i = 0; i < len; i++) {
+ btf_mod = &used_btfs[i];
+ if (btf_mod->module)
+ module_put(btf_mod->module);
+ btf_put(btf_mod->btf);
+ }
+#endif
+}
+
+static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ kfree(aux->used_btfs);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
+#endif
bpf_free_used_maps(aux);
+ bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
#endif
- bpf_trampoline_put(aux->trampoline);
- for (i = 0; i < aux->func_cnt; i++)
+ if (aux->dst_trampoline)
+ bpf_trampoline_put(aux->dst_trampoline);
+ for (i = 0; i < aux->real_func_cnt; i++) {
+ /* We can just unlink the subprog poke descriptor table as
+ * it was originally linked to the main program and is also
+ * released along with it.
+ */
+ aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
- if (aux->func_cnt) {
+ }
+ if (aux->real_func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
@@ -2098,13 +2773,12 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- if (aux->linked_prog)
- bpf_prog_put(aux->linked_prog);
+ if (aux->dst_prog)
+ bpf_prog_put(aux->dst_prog);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
@@ -2148,6 +2822,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;
@@ -2157,6 +2832,8 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
@@ -2165,12 +2842,21 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
+const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -2213,12 +2899,32 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
* analysis code and wants explicit zero extension inserted by verifier.
* Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
*/
bool __weak bpf_jit_needs_zext(void)
{
return false;
}
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2234,6 +2940,37 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return -ENOTSUPP;
+}
+
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 27595fc6da56..ef82ffc90cbe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,18 +4,22 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
+#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@@ -24,10 +28,11 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/capability.h>
+#include <linux/completion.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
-#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
/* General idea: XDP packets getting XDP redirected to another CPU,
@@ -52,180 +57,213 @@ struct xdp_bulk_queue {
struct bpf_cpu_map_entry {
u32 cpu; /* kthread CPU and map index */
int map_id; /* Back reference to map */
- u32 qsize; /* Queue size placeholder for map lookup */
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
- struct bpf_cpu_map *cmap;
-
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
- struct work_struct kthread_stop_wq;
- atomic_t refcnt; /* Control when this struct can be free'ed */
- struct rcu_head rcu;
+ struct bpf_cpumap_val value;
+ struct bpf_prog *prog;
+
+ struct completion kthread_running;
+ struct rcu_work free_work;
};
struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
- struct bpf_cpu_map_entry **cpu_map;
+ struct bpf_cpu_map_entry __rcu **cpu_map;
};
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
+ u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
- u64 cost;
- int ret;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+ value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
- /* make sure page count doesn't overflow */
- cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
-
- /* Notice returns -EPERM on if map size is larger than memlock limit */
- ret = bpf_map_charge_init(&cmap->map.memory, cost);
- if (ret) {
- err = ret;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_charge;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_charge:
- bpf_map_charge_finish(&cmap->map.memory);
-free_cmap:
- kfree(cmap);
- return ERR_PTR(err);
}
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
- atomic_inc(&rcpu->refcnt);
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ void *ptr;
+
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
+ }
}
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ struct list_head *listp,
+ struct xdp_cpumap_stats *stats)
{
- struct bpf_cpu_map_entry *rcpu;
+ struct sk_buff *skb, *tmp;
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ list_for_each_entry_safe(skb, tmp, listp, list) {
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_REDIRECT:
+ skb_list_del_init(skb);
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ return;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ stats->drop++;
+ return;
+ }
+ }
+}
- rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+ void **frames, int n,
+ struct xdp_cpumap_stats *stats)
+{
+ struct xdp_rxq_info rxq = {};
+ struct xdp_buff xdp;
+ int i, nframes = 0;
- /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
- * as it waits until all in-flight call_rcu() callbacks complete.
- */
- rcu_barrier();
+ xdp_set_return_frame_no_direct();
+ xdp.rxq = &rxq;
- /* kthread_stop will wake_up_process and wait for it to complete */
- kthread_stop(rcpu->kthread);
-}
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
-static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_frame *xdpf,
- struct sk_buff *skb)
-{
- unsigned int hard_start_headroom;
- unsigned int frame_size;
- void *pkt_data_start;
+ rxq.dev = xdpf->dev_rx;
+ rxq.mem = xdpf->mem;
+ /* TODO: report queue_index to xdp_rxq_info */
- /* Part of headroom was reserved to xdpf */
- hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
+ xdp_convert_frame_to_buff(xdpf, &xdp);
- /* Memory size backing xdp_frame data already have reserved
- * room for build_skb to place skb_shared_info in tailroom.
- */
- frame_size = xdpf->frame_sz;
+ act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (err < 0) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ frames[nframes++] = xdpf;
+ stats->pass++;
+ }
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ break;
+ }
+ }
- pkt_data_start = xdpf->data - hard_start_headroom;
- skb = build_skb_around(skb, pkt_data_start, frame_size);
- if (unlikely(!skb))
- return NULL;
+ xdp_clear_return_frame_no_direct();
- skb_reserve(skb, hard_start_headroom);
- __skb_put(skb, xdpf->len);
- if (xdpf->metasize)
- skb_metadata_set(skb, xdpf->metasize);
+ return nframes;
+}
- /* Essential SKB info: protocol and skb->dev */
- skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
+#define CPUMAP_BATCH 8
- /* Optional SKB info, currently missing:
- * - HW checksum info (skb->ip_summed)
- * - HW RX hash (skb_set_hash)
- * - RX ring dev queue index (skb_record_rx_queue)
- */
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ int xdp_n, struct xdp_cpumap_stats *stats,
+ struct list_head *list)
+{
+ int nframes;
- /* Until page_pool get SKB return path, release DMA here */
- xdp_release_frame(xdpf);
+ if (!rcpu->prog)
+ return xdp_n;
- /* Allow SKB to reuse area used by xdp_frame */
- xdp_scrub_frame(xdpf);
+ rcu_read_lock_bh();
- return skb;
-}
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
-static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
-{
- /* The tear-down procedure should have made sure that queue is
- * empty. See __cpu_map_entry_replace() and work-queue
- * invoked cpu_map_kthread_stop(). Catch any broken behaviour
- * gracefully and warn once.
- */
- struct xdp_frame *xdpf;
+ if (stats->redirect)
+ xdp_do_flush();
- while ((xdpf = ptr_ring_consume(ring)))
- if (WARN_ON_ONCE(xdpf))
- xdp_return_frame(xdpf);
-}
+ if (unlikely(!list_empty(list)))
+ cpu_map_bpf_prog_run_skb(rcpu, list, stats);
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- /* The queue should be empty at this point */
- __cpu_map_ring_cleanup(rcpu->queue);
- ptr_ring_cleanup(rcpu->queue, NULL);
- kfree(rcpu->queue);
- kfree(rcpu);
- }
-}
+ rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
-#define CPUMAP_BATCH 8
+ return nframes;
+}
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
/* When kthread gives stop order, then rcpu have been disconnected
@@ -234,11 +272,13 @@ static int cpu_map_kthread_run(void *data)
* kthread_stop signal until queue is empty.
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
- unsigned int drops = 0, sched = 0;
+ struct xdp_cpumap_stats stats = {}; /* zero stats */
+ unsigned int kmem_alloc_drops = 0, sched = 0;
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m;
+ LIST_HEAD(list);
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -259,11 +299,22 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
- for (i = 0; i < n; i++) {
+ n = __ptr_ring_consume_batched(rcpu->queue, frames,
+ CPUMAP_BATCH);
+ for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
- struct page *page = virt_to_page(f);
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ list_add_tail(&skb->list, &list);
+ continue;
+ }
+
+ frames[xdp_n++] = f;
+ page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@@ -272,59 +323,84 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < n; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- drops = n;
+ /* Support running another XDP prog on this CPU */
+ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
+ if (nframes) {
+ m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
+ if (unlikely(m == 0)) {
+ for (i = 0; i < nframes; i++)
+ skbs[i] = NULL; /* effect: xdp_return_frame */
+ kmem_alloc_drops += nframes;
+ }
}
local_bh_disable();
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nframes; i++) {
struct xdp_frame *xdpf = frames[i];
struct sk_buff *skb = skbs[i];
- int ret;
- skb = cpu_map_build_skb(rcpu, xdpf, skb);
+ skb = __xdp_build_skb_from_frame(xdpf, skb,
+ xdpf->dev_rx);
if (!skb) {
xdp_return_frame(xdpf);
continue;
}
- /* Inject into network stack */
- ret = netif_receive_skb_core(skb);
- if (ret == NET_RX_DROP)
- drops++;
+ list_add_tail(&skb->list, &list);
}
+ netif_receive_skb_list(&list);
+
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
+ sched, &stats);
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
- put_cpu_map_entry(rcpu);
return 0;
}
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
- int map_id)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ rcpu->value.bpf_prog.id = prog->aux->id;
+ rcpu->prog = prog;
+
+ return 0;
+}
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
+ u32 cpu)
+{
+ int numa, err, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
struct xdp_bulk_queue *bq;
- int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
- rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
if (!rcpu)
return NULL;
/* Alloc percpu bulkq */
- rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
- sizeof(void *), gfp);
+ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
if (!rcpu->bulkq)
goto free_rcu;
@@ -334,33 +410,45 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
}
/* Alloc queue */
- rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
+ numa);
if (!rcpu->queue)
goto free_bulkq;
- err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
if (err)
goto free_queue;
rcpu->cpu = cpu;
- rcpu->map_id = map_id;
- rcpu->qsize = qsize;
+ rcpu->map_id = map->id;
+ rcpu->value.qsize = value->qsize;
+
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
+ goto free_ptr_ring;
/* Setup kthread */
+ init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
- "cpumap/%d/map:%d", cpu, map_id);
+ "cpumap/%d/map:%d", cpu,
+ map->id);
if (IS_ERR(rcpu->kthread))
- goto free_ptr_ring;
-
- get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
- get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+ goto free_prog;
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
return rcpu;
+free_prog:
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
free_ptr_ring:
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
@@ -372,55 +460,54 @@ free_rcu:
return NULL;
}
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
- rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
free_percpu(rcpu->bulkq);
- /* Cannot kthread_stop() here, last put free rcpu resources */
- put_cpu_map_entry(rcpu);
+ kfree(rcpu);
}
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
struct bpf_cpu_map_entry *old_rcpu;
- old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
- call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
- INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
- schedule_work(&old_rcpu->kthread_stop_wq);
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_wq, &old_rcpu->free_work);
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -428,21 +515,21 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
if (key_cpu >= map->max_entries)
return -EINVAL;
- /* notice caller map_delete_elem() use preempt_disable() */
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
__cpu_map_entry_replace(cmap, key_cpu, NULL);
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpumap_val cpumap_value = {};
struct bpf_cpu_map_entry *rcpu;
-
/* Array index key correspond to CPU number */
u32 key_cpu = *(u32 *)key;
- /* Value is the queue size */
- u32 qsize = *(u32 *)value;
+
+ memcpy(&cpumap_value, value, map->value_size);
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -450,21 +537,20 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
return -EOVERFLOW;
/* Make sure CPU is a valid possible cpu */
if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
return -ENODEV;
- if (qsize == 0) {
+ if (cpumap_value.qsize == 0) {
rcpu = NULL; /* Same as deleting */
} else {
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
- rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
- rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -480,33 +566,35 @@ static void cpu_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the bpf programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
- * It does __not__ ensure pending flush operations (if any) are
- * complete.
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+ * (if any) are completed.
*/
-
- bpf_clear_redirect_map(map);
synchronize_rcu();
- /* For cpu_map the remote CPUs can still be using the entries
- * (struct bpf_cpu_map_entry).
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
*/
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
- rcpu = READ_ONCE(cmap->cpu_map[i]);
+ rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU grace-period */
- __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -514,7 +602,8 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- rcpu = READ_ONCE(cmap->cpu_map[key]);
+ rcpu = rcu_dereference_check(cmap->cpu_map[key],
+ rcu_read_lock_bh_held());
return rcpu;
}
@@ -523,7 +612,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_cpu_map_entry *rcpu =
__cpu_map_lookup_elem(map, *(u32 *)key);
- return rcpu ? &rcpu->qsize : NULL;
+ return rcpu ? &rcpu->value : NULL;
}
static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -543,7 +632,24 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
+ __cpu_map_lookup_elem);
+}
+
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
.map_free = cpu_map_free,
.map_delete_elem = cpu_map_delete_elem,
@@ -551,9 +657,12 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = cpu_map_mem_usage,
+ .map_btf_id = &cpu_map_btf_ids[0],
+ .map_redirect = cpu_map_redirect,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
+static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -562,7 +671,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
int i;
if (unlikely(!bq->count))
- return 0;
+ return;
q = rcpu->queue;
spin_lock(&q->producer_lock);
@@ -585,13 +694,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
- return 0;
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
+static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -612,19 +720,11 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
if (!bq->flush_node.prev)
list_add(&bq->flush_node, flush_list);
-
- return 0;
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
@@ -632,6 +732,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
+
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
@@ -645,6 +764,16 @@ void __cpu_map_flush(void)
}
}
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+ return false;
+ __cpu_map_flush();
+ return true;
+}
+#endif
+
static int __init cpu_map_init(void)
{
int cpu;
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..2e73533a3811
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!refcount_dec_and_test(&cpumask->usage))
+ return;
+
+ migrate_disable();
+ bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
+}
+
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+ bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the
+ * AND of two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Find the index of the first nonzero bit of the AND of two cpumasks.
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_first_and(src1, src2);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask)
+{
+ return cpumask_any_distribute(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of
+ * two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at
+ * least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_any_and_distribute(src1, src2);
+}
+
+/**
+ * bpf_cpumask_weight() - Return the number of bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+ return cpumask_weight(cpumask);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_SET8_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release_dtor)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 5fdbc776a760..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -57,13 +58,13 @@ struct xdp_dev_bulk_queue {
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
- struct bpf_dtab *dtab;
struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
@@ -72,7 +73,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
- struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
+ struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -92,7 +93,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries,
int i;
struct hlist_head *hash;
- hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
+ hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -109,8 +110,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
u32 valsize = attr->value_size;
- u64 cost = 0;
- int err;
/* check sanity of attributes. 2 value sizes supported:
* 4 bytes: ifindex
@@ -135,36 +134,24 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (!dtab->n_buckets) /* Overflow check */
return -EINVAL;
- cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
- } else {
- cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
}
- /* if map size is larger than memlock limit, reject it */
- err = bpf_map_charge_init(&dtab->map.memory, cost);
- if (err)
- return -EINVAL;
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
- goto free_charge;
+ return -ENOMEM;
spin_lock_init(&dtab->index_lock);
} else {
- dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_charge;
+ return -ENOMEM;
}
return 0;
-
-free_charge:
- bpf_map_charge_finish(&dtab->map.memory);
- return -ENOMEM;
}
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -172,16 +159,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
struct bpf_dtab *dtab;
int err;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -239,7 +223,7 @@ static void dev_map_free(struct bpf_map *map)
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
- dev = dtab->netdev_map[i];
+ dev = rcu_dereference_raw(dtab->netdev_map[i]);
if (!dev)
continue;
@@ -252,7 +236,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -272,7 +256,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -331,82 +319,120 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-bool dev_map_can_have_prog(struct bpf_map *map)
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
{
- if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
- map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
- return true;
-
- return false;
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
}
-static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
- int sent = 0, drops = 0, err = 0;
+ unsigned int cnt = bq->count;
+ int sent = 0, err = 0;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
- return 0;
+ if (unlikely(!cnt))
+ return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
+ /* If ndo_xdp_xmit fails with an errno, no frames have
+ * been xmit'ed.
+ */
err = sent;
sent = 0;
- goto error;
}
- drops = bq->count - sent;
-out:
- bq->count = 0;
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
- bq->dev_rx = NULL;
- __list_del_clearprev(&bq->flush_node);
- return 0;
-error:
- /* If ndo_xdp_xmit fails with an errno, no frames have been
- * xmit'ed and it's our responsibility to them free all.
+ /* If not all frames have been transmitted, it is our
+ * responsibility to free them
*/
- for (i = 0; i < bq->count; i++) {
- struct xdp_frame *xdpf = bq->q[i];
+ for (i = sent; unlikely(i < to_send); i++)
+ xdp_return_frame_rx_napi(bq->q[i]);
- xdp_return_frame_rx_napi(xdpf);
- drops++;
- }
- goto out;
-}
-
-/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
- * from the driver before returning from its napi->poll() routine. The poll()
- * routine is called either from busy_poll context or net_rx_action signaled
- * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the flush list
- * is empty before completing to ensure all flush operations have completed.
- * When drivers update the bpf program they may need to ensure any flush ops
- * are also complete. Using synchronize_rcu or call_rcu will suffice for this
- * because both wait for napi context to exit.
+out:
+ bq->count = 0;
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
+}
+
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
*/
void __dev_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
+}
+
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&dev_flush_list)))
+ return false;
+ __dev_flush();
+ return true;
}
+#endif
-/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
*/
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *obj;
@@ -414,15 +440,17 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- obj = READ_ONCE(dtab->netdev_map[key]);
+ obj = rcu_dereference_check(dtab->netdev_map[key],
+ rcu_read_lock_bh_held());
return obj;
}
-/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
*/
-static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -433,83 +461,215 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
-
- bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
+ bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
+ }
- return 0;
+ bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
- if (!dev->netdev_ops->ndo_xdp_xmit)
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
- return bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
+ return 0;
}
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
- struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
u32 act;
- xdp_set_data_meta_invalid(xdp);
- xdp->txq = &txq;
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
- act = bpf_prog_run_xdp(xdp_prog, xdp);
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
switch (act) {
case XDP_PASS:
- return xdp;
- case XDP_DROP:
+ __skb_push(skb, skb->mac_len);
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
break;
}
- xdp_return_buff(xdp);
- return NULL;
+ return act;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- if (dst->xdp_prog) {
- xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
- if (!xdp)
- return 0;
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
+}
+
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return false;
+
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
+ return false;
+
+ return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
+{
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
+}
+
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
}
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -520,12 +680,116 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ struct hlist_node *next;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -551,7 +815,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -560,20 +824,15 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- /* Use call_rcu() here to ensure any rcu critical sections have
- * completed as well as any flush operations because call_rcu
- * will wait for preempt-disable region to complete, NAPI in this
- * context. And additionally, the driver tear down ensures all
- * soft irqs are complete before removing the net device in the
- * case of dev_put equals zero.
- */
- old_dev = xchg(&dtab->netdev_map[k], NULL);
- if (old_dev)
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
+ if (old_dev) {
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -603,8 +862,9 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_prog *prog = NULL;
struct bpf_dtab_netdev *dev;
- dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
- dtab->map.numa_node);
+ dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
+ GFP_NOWAIT | __GFP_NOWARN,
+ dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -617,12 +877,12 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
dev->idx = idx;
- dev->dtab = dtab;
if (prog) {
dev->xdp_prog = prog;
dev->val.bpf_prog.id = prog->aux->id;
@@ -642,8 +902,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -675,22 +935,24 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
* Remembering the driver side flush operation will happen before the
* net device is removed.
*/
- old_dev = xchg(&dtab->netdev_map[i], dev);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -742,14 +1004,44 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
+}
+
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
+}
+
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -757,9 +1049,13 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
+ .map_redirect = dev_map_redirect,
};
const struct bpf_map_ops dev_map_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -767,6 +1063,9 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
+ .map_redirect = dev_hash_map_redirect,
};
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
@@ -808,9 +1107,7 @@ static int dev_map_notification(struct notifier_block *notifier,
break;
/* will be freed in free_netdev() */
- netdev->xdp_bulkq =
- __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
- sizeof(void *), GFP_ATOMIC);
+ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
@@ -833,13 +1130,15 @@ static int dev_map_notification(struct notifier_block *notifier,
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
- dev = READ_ONCE(dtab->netdev_map[i]);
+ dev = rcu_dereference(dtab->netdev_map[i]);
if (!dev || netdev != dev->dev)
continue;
- odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
- if (dev == odev)
+ odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
+ if (dev == odev) {
call_rcu(&dev->rcu,
__dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
}
}
rcu_read_unlock();
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index b44d8c447afd..49940c26a227 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
- if (insn->src_reg != BPF_PSEUDO_CALL &&
+ if (!insn->src_reg &&
insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
func_id_str[insn->imm])
return func_id_str[insn->imm];
- if (cbs && cbs->cb_call)
- return cbs->cb_call(cbs->private_data, insn);
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
return buff;
}
@@ -80,6 +87,24 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_alu_sign_string[16] = {
+ [BPF_DIV >> 4] = "s/=",
+ [BPF_MOD >> 4] = "s%=",
+};
+
+static const char *const bpf_movsx_string[4] = {
+ [0] = "(s8)",
+ [1] = "(s16)",
+ [3] = "(s32)",
+};
+
+static const char *const bpf_atomic_alu_string[16] = {
+ [BPF_ADD >> 4] = "add",
+ [BPF_AND >> 4] = "and",
+ [BPF_OR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
+};
+
static const char *const bpf_ldst_string[] = {
[BPF_W >> 3] = "u32",
[BPF_H >> 3] = "u16",
@@ -87,6 +112,12 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
+static const char *const bpf_ldsx_string[] = {
+ [BPF_W >> 3] = "s32",
+ [BPF_H >> 3] = "s16",
+ [BPF_B >> 3] = "s8",
+};
+
static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
@@ -114,6 +145,27 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose,
insn->imm, insn->dst_reg);
}
+static void print_bpf_bswap_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = bswap%d r%d\n",
+ insn->code, insn->dst_reg,
+ insn->imm, insn->dst_reg);
+}
+
+static bool is_sdiv_smod(const struct bpf_insn *insn)
+{
+ return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) &&
+ insn->off == 1;
+}
+
+static bool is_movsx(const struct bpf_insn *insn)
+{
+ return BPF_OP(insn->code) == BPF_MOV &&
+ (insn->off == 8 || insn->off == 16 || insn->off == 32);
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -124,7 +176,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
+ print_bpf_bswap_insn(verbose, cbs->private_data, insn);
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
@@ -133,17 +185,20 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "",
class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
insn->imm);
}
} else if (class == BPF_STX) {
@@ -153,32 +208,66 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
+ insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ bpf_alu_string[BPF_OP(insn->imm) >> 4],
+ insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == (BPF_ADD | BPF_FETCH) ||
+ insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG) {
+ verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
insn->code,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
- else
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_XCHG) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+ }
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ BPF_MODE(insn->code) == BPF_MEM ?
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3] :
+ bpf_ldsx_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
@@ -233,6 +322,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
+ insn->code, insn->imm);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..70fb82bf1637 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -4,6 +4,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/static_call.h>
/* The BPF dispatcher is a multiway branch code generator. The
* dispatcher is a mechanism to avoid the performance penalty of an
@@ -85,12 +86,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,34 +100,38 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
- u32 noff;
- int err;
-
- if (!prev_num_progs) {
- old = NULL;
- noff = 0;
- } else {
- old = d->image + d->image_off;
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
noff = d->image_off ^ (PAGE_SIZE / 2);
- }
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
- err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
- if (err || !new)
- return;
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
+
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
- d->image_off = noff;
+ if (new)
+ d->image_off = noff;
}
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
@@ -140,10 +145,16 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
- bpf_image_ksym_add(d->image, &d->ksym);
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ bpf_prog_pack_free(d->image, PAGE_SIZE);
+ d->image = NULL;
+ goto out;
+ }
+ bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b32cc8ce8ff6..03a6a2500b6a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -7,11 +7,15 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -30,7 +34,7 @@
/*
* The bucket lock has two protection scopes:
*
- * 1) Serializing concurrent operations from BPF programs on differrent
+ * 1) Serializing concurrent operations from BPF programs on different
* CPUs
*
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
@@ -45,12 +49,12 @@
* events, kprobes and tracing to be invoked before the prior invocation
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
*
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
* operations like memory allocations (even with GFP_ATOMIC) from atomic
* contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
* sections. To ensure the deterministic behaviour these locks are regular
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
* true atomic contexts on an RT kernel are the low level hardware
@@ -59,34 +63,31 @@
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
- * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ raw_spinlock_t raw_lock;
};
+#define HASHTAB_MAP_LOCK_COUNT 8
+#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
+
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -94,10 +95,17 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
+ struct lock_class_key lockdep_key;
+ int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
@@ -107,14 +115,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -126,44 +134,51 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab))
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- else
- spin_lock_init(&htab->buckets[i].lock);
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
+ &htab->lockdep_key);
+ cond_resched();
}
}
-static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
- struct bucket *b)
+static inline int htab_lock_bucket(const struct bpf_htab *htab,
+ struct bucket *b, u32 hash,
+ unsigned long *pflags)
{
unsigned long flags;
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
- return flags;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
+
+ preempt_disable();
+ local_irq_save(flags);
+ if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
+ __this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
+ preempt_enable();
+ return -EBUSY;
+ }
+
+ raw_spin_lock(&b->raw_lock);
+ *pflags = flags;
+
+ return 0;
}
static inline void htab_unlock_bucket(const struct bpf_htab *htab,
- struct bucket *b,
+ struct bucket *b, u32 hash,
unsigned long flags)
{
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
+ raw_spin_unlock(&b->raw_lock);
+ __this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
+ preempt_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -198,7 +213,60 @@ static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
- return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+ return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
+}
+
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
+}
+
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
+ cond_resched();
+ }
}
static void htab_free_elems(struct bpf_htab *htab)
@@ -238,6 +306,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ bpf_map_inc_elem_count(&htab->map);
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size);
return l;
@@ -251,10 +320,10 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -266,7 +335,8 @@ static int prealloc_init(struct bpf_htab *htab)
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
- pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+ pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@@ -320,8 +390,8 @@ static int alloc_extra_elems(struct bpf_htab *htab)
struct pcpu_freelist_node *l;
int cpu;
- pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
- GFP_USER | __GFP_NOWARN);
+ pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
@@ -354,17 +424,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !bpf_capable())
- /* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_BPF.
- */
- return -EPERM;
-
if (zero_seed && !capable(CAP_SYS_ADMIN))
/* Guard against local DoS, and discourage production use. */
return -EPERM;
@@ -389,17 +451,11 @@ static int htab_map_alloc_check(union bpf_attr *attr)
attr->value_size == 0)
return -EINVAL;
- if (attr->key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- return -E2BIG;
-
- if (attr->value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct htab_elem))
- /* if value_size is bigger, the user space won't be able to
- * access the elements via bpf syscall. This check also makes
- * sure that the elem_size doesn't overflow and it's
+ if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
+ sizeof(struct htab_elem))
+ /* if key_size + value_size is bigger, the user space won't be
+ * able to access the elements via bpf syscall. This check
+ * also makes sure that the elem_size doesn't overflow and it's
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
@@ -421,13 +477,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- u64 cost;
- int err;
+ int err, i;
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
+ lockdep_register_key(&htab->lockdep_key);
+
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
@@ -458,17 +515,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- cost = (u64) htab->n_buckets * sizeof(struct bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
-
- if (percpu)
- cost += (u64) round_up(htab->map.value_size, 8) *
- num_possible_cpus() * htab->map.max_entries;
- else
- cost += (u64) htab->elem_size * num_possible_cpus();
-
- /* if map size is larger than memlock limit, reject it */
- err = bpf_map_charge_init(&htab->map.memory, cost);
+ err = bpf_map_init_elem_count(&htab->map);
if (err)
goto free_htab;
@@ -477,19 +524,51 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_charge;
+ goto free_elem_count;
+
+ for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
+ htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
+ sizeof(int),
+ sizeof(int),
+ GFP_USER);
+ if (!htab->map_locked[i])
+ goto free_map_locked;
+ }
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
- goto free_buckets;
+ goto free_map_locked;
if (!percpu && !lru) {
/* lru itself can remove the least used element, so
@@ -499,23 +578,42 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
free_prealloc:
prealloc_destroy(htab);
-free_buckets:
+free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
+ for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
+ free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
-free_charge:
- bpf_map_charge_finish(&htab->map.memory);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+free_elem_count:
+ bpf_map_free_elem_count(&htab->map);
free_htab:
- kfree(htab);
+ lockdep_unregister_key(&htab->lockdep_key);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -577,8 +675,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -612,14 +710,14 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
* bpf_prog
* __htab_map_lookup_elem
*/
-static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -651,7 +749,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
return __htab_lru_map_lookup_elem(map, key, false);
}
-static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -660,7 +758,7 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -676,31 +774,52 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
+{
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = elem->key + round_up(htab->map.key_size, 8);
+
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
- struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct bpf_htab *htab = arg;
struct htab_elem *l = NULL, *tgt_l;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
unsigned long flags;
struct bucket *b;
+ int ret;
tgt_l = container_of(node, struct htab_elem, lru_node);
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+ if (ret)
+ return false;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_fields(htab, l);
+ bpf_map_dec_elem_count(&htab->map);
break;
}
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, tgt_l->hash, flags);
return l == tgt_l;
}
@@ -766,17 +885,10 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
+ check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -786,20 +898,50 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) {
ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, true);
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_inc_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_dec_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ bpf_map_dec_elem_count(&htab->map);
+ check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -808,19 +950,41 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
{
if (!onallcpus) {
/* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
}
+static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
+ * known initial values for cpus other than current one
+ * (onallcpus=false always when coming from bpf prog).
+ */
+ if (!onallcpus) {
+ int current_cpu = raw_smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == current_cpu)
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
+ }
+ } else {
+ pcpu_copy_value(htab, pptr, value, onallcpus);
+ }
+}
+
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
@@ -853,45 +1017,42 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
+ bpf_map_inc_elem_count(&htab->map);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = __alloc_percpu_gfp(size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
@@ -907,7 +1068,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -926,8 +1087,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -941,7 +1102,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -951,7 +1113,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -972,7 +1134,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/
}
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ return ret;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1010,15 +1174,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_fields(htab, l_old);
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
return ret;
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_fields(htab, elem);
+ bpf_map_dec_elem_count(&htab->map);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1032,7 +1205,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1049,9 +1223,12 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+ copy_map_value(&htab->map,
+ l_new->key + round_up(map->key_size, 8), value);
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1070,19 +1247,20 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
ret = 0;
err:
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
if (ret)
- bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ htab_lru_push_free(htab, l_new);
else if (l_old)
- bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+ htab_lru_push_free(htab, l_old);
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1096,7 +1274,8 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1105,7 +1284,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ return ret;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1128,13 +1309,13 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1148,7 +1329,8 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1168,7 +1350,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
}
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1183,34 +1367,37 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
value, onallcpus);
} else {
- pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, flags);
- if (l_new)
+ htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
+ if (l_new) {
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ }
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1218,9 +1405,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
- int ret = -ENOENT;
+ int ret;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1228,21 +1416,24 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ return ret;
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
hlist_nulls_del_rcu(&l->hash_node);
free_htab_elem(htab, l);
- ret = 0;
+ } else {
+ ret = -ENOENT;
}
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1250,9 +1441,10 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
- int ret = -ENOENT;
+ int ret;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1260,18 +1452,20 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- flags = htab_lock_bucket(htab, b);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ return ret;
l = lookup_elem_raw(head, hash, key, key_size);
- if (l) {
+ if (l)
hlist_nulls_del_rcu(&l->hash_node);
- ret = 0;
- }
+ else
+ ret = -ENOENT;
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
if (l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1279,6 +1473,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1289,32 +1487,74 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
+}
+
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry(l, n, head, hash_node) {
+ /* We only free timer on uref dropping to zero */
+ bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* We only free timer on uref dropping to zero */
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
+ return;
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers(htab);
+ else
+ htab_free_prealloced_timers(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ int i;
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
+ /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
+ * bpf_free_used_maps() is called after bpf prog is no longer executing.
+ * There is no need to synchronize_rcu() here to protect map elements.
*/
- synchronize_rcu();
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
- if (!htab_is_prealloc(htab))
+ if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
- else
+ } else {
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
+ }
+ bpf_map_free_elem_count(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
- kfree(htab);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
+ for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
+ free_percpu(htab->map_locked[i]);
+ lockdep_unregister_key(&htab->lockdep_key);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1338,6 +1578,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, bool is_lru_map,
+ bool is_percpu, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ unsigned long bflags;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret;
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(htab, b, hash, &bflags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l) {
+ ret = -ENOENT;
+ } else {
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
+
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
+ }
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
+
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
+ }
+
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (!is_lru_map)
+ free_htab_elem(htab, l);
+ }
+
+ htab_unlock_bucket(htab, b, hash, bflags);
+
+ if (is_lru_map && l)
+ htab_lru_push_free(htab, l);
+
+ return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+ flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+ flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+ flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+ flags);
+}
+
static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
@@ -1350,8 +1684,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
- void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
- u32 batch, max_count, size, bucket_size;
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size, map_id;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1364,7 +1698,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
@@ -1393,7 +1727,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
@@ -1401,8 +1735,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
@@ -1417,8 +1751,14 @@ again_nocopy:
b = &htab->buckets[batch];
head = &b->head;
/* do not grab the lock unless need it (bucket_cnt > 0). */
- if (locked)
- flags = htab_lock_bucket(htab, b);
+ if (locked) {
+ ret = htab_lock_bucket(htab, b, batch, &flags);
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
+ }
bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@@ -1435,7 +1775,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, batch, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
goto after_loop;
@@ -1446,7 +1786,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, batch, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
kvfree(keys);
@@ -1467,18 +1807,27 @@ again_nocopy:
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(dst_val + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
value = l->key + roundup_key_size;
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ struct bpf_map **inner_map = value;
+
+ /* Actual value is the id of the inner map */
+ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+ value = &map_id;
+ }
+
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value,
true);
else
copy_map_value(map, dst_val, value);
- check_and_init_map_lock(map, dst_val);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@@ -1499,13 +1848,13 @@ again_nocopy:
dst_val += value_size;
}
- htab_unlock_bucket(htab, b, flags);
+ htab_unlock_bucket(htab, b, batch, flags);
locked = false;
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
}
next_batch:
@@ -1620,31 +1969,327 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
true, false);
}
+struct bpf_iter_seq_hash_map_info {
+ struct bpf_map *map;
+ struct bpf_htab *htab;
+ void *percpu_value_buf; // non-zero means percpu hash
+ u32 bucket_id;
+ u32 skip_elems;
+};
+
+static struct htab_elem *
+bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
+ struct htab_elem *prev_elem)
+{
+ const struct bpf_htab *htab = info->htab;
+ u32 skip_elems = info->skip_elems;
+ u32 bucket_id = info->bucket_id;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ struct bucket *b;
+ u32 i, count;
+
+ if (bucket_id >= htab->n_buckets)
+ return NULL;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ /* no update/deletion on this bucket, prev_elem should be still valid
+ * and we won't skip elements.
+ */
+ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
+ elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
+ if (elem)
+ return elem;
+
+ /* not found, unlock and go to the next bucket */
+ b = &htab->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ for (i = bucket_id; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+
+ count = 0;
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ if (count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return elem;
+ }
+ count++;
+ }
+
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ struct htab_elem *elem;
+
+ elem = bpf_hash_map_seq_find_next(info, NULL);
+ if (!elem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return elem;
+}
+
+static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_hash_map_seq_find_next(info, v);
+}
+
+static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ u32 roundup_key_size, roundup_value_size;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ int ret = 0, off = 0, cpu;
+ struct bpf_prog *prog;
+ void __percpu *pptr;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ roundup_key_size = round_up(map->key_size, 8);
+ ctx.key = elem->key;
+ if (!info->percpu_value_buf) {
+ ctx.value = elem->key + roundup_key_size;
+ } else {
+ roundup_value_size = round_up(map->value_size, 8);
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
+ off += roundup_value_size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_hash_map_seq_show(seq, v);
+}
+
+static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_hash_map_seq_show(seq, NULL);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_hash_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ bpf_map_inc_with_uref(map);
+ seq_info->map = map;
+ seq_info->htab = container_of(map, struct bpf_htab, map);
+ return 0;
+}
+
+static void bpf_iter_fini_hash_map(void *priv_data)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_hash_map_seq_ops = {
+ .start = bpf_hash_map_seq_start,
+ .next = bpf_hash_map_seq_next,
+ .stop = bpf_hash_map_seq_stop,
+ .show = bpf_hash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_hash_map_seq_ops,
+ .init_seq_private = bpf_iter_init_hash_map,
+ .fini_seq_private = bpf_iter_fini_hash_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
+};
+
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ u32 roundup_key_size;
+ int i, num_elems = 0;
+ void __percpu *pptr;
+ struct bucket *b;
+ void *key, *val;
+ bool is_percpu;
+ u64 ret = 0;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = htab_is_percpu(htab);
+
+ roundup_key_size = round_up(map->key_size, 8);
+ /* disable migration so percpu value prepared here will be the
+ * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ */
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ key = elem->key;
+ if (is_percpu) {
+ /* current cpu value for percpu map */
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ val = this_cpu_ptr(pptr);
+ } else {
+ val = elem->key + roundup_key_size;
+ }
+ num_elems++;
+ ret = callback_fn((u64)(long)map, (u64)(long)key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret) {
+ rcu_read_unlock();
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
/* Called from eBPF program */
@@ -1658,6 +2303,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l)
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ else
+ return NULL;
+}
+
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -1670,6 +2329,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
struct htab_elem *l;
@@ -1692,8 +2367,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
@@ -1750,27 +2425,43 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
}
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
@@ -1794,7 +2485,7 @@ static void fd_htab_map_free(struct bpf_map *map)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
}
}
@@ -1833,9 +2524,15 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(ptr))
return PTR_ERR(ptr);
+ /* The htab bucket lock is always held during update operations in fd
+ * htab map, and the following rcu_read_lock() is only used to avoid
+ * the WARN_ON_ONCE in htab_map_update_elem().
+ */
+ rcu_read_lock();
ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ rcu_read_unlock();
if (ret)
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
return ret;
}
@@ -1869,7 +2566,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
-static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+static int htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -1877,7 +2574,7 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -1905,4 +2602,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be43ab3e619f..d19cd863d294 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,9 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -13,7 +16,13 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+#include <linux/security.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
#include "../../lib/kstrtox.h"
@@ -23,12 +32,13 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -44,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -61,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_delete_elem(map, key);
}
@@ -99,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -108,11 +120,27 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
}
const struct bpf_func_proto bpf_map_peek_elem_proto = {
- .func = bpf_map_pop_elem,
+ .func = bpf_map_peek_elem,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+};
+
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+ .func = bpf_map_lookup_percpu_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
@@ -167,6 +195,29 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_coarse_ns)
+{
+ return ktime_get_coarse_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
+ .func = bpf_ktime_get_coarse_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -210,13 +261,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -244,6 +290,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
arch_spin_lock(l);
}
@@ -252,6 +299,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
arch_spinlock_t *l = (void *)lock;
arch_spin_unlock(l);
+ preempt_enable();
}
#else
@@ -277,13 +325,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
return 0;
}
@@ -292,15 +345,21 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
return 0;
}
@@ -309,6 +368,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -317,13 +377,13 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
- ____bpf_spin_lock(lock);
+ __bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
- ____bpf_spin_unlock(lock);
+ __bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@@ -341,9 +401,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = {
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
+ u64 cgrp_id;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
- return cgroup_id(cgrp);
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
@@ -354,13 +420,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
struct cgroup *ancestor;
+ u64 cgrp_id;
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
- if (!ancestor)
- return 0;
- return cgroup_id(ancestor);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
@@ -369,39 +439,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- void *ptr;
-
- storage = this_cpu_read(bpf_cgroup_storage[stype]);
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -497,7 +535,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@@ -525,12 +563,25 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
+
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+static const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
@@ -597,15 +648,1038 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
+ const void __user *, user_ptr)
+{
+ int ret = copy_from_user(dst, user_ptr, size);
+
+ if (unlikely(ret)) {
+ memset(dst, 0, size);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_proto = {
+ .func = bpf_copy_from_user,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
+BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return (unsigned long)NULL;
+
+ return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+}
+
+const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
+ .func = bpf_per_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
+{
+ return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+}
+
+const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
+ .func = bpf_this_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+};
+
+static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE)
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ fallthrough;
+#endif
+ case 'k':
+ return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ case 'u':
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ }
+
+ return -EINVAL;
+}
+
+/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
+ * arguments representation.
+ */
+#define MAX_BPRINTF_BIN_ARGS 512
+
+/* Support executing three nested bprintf helper calls on a given CPU */
+#define MAX_BPRINTF_NEST_LEVEL 3
+struct bpf_bprintf_buffers {
+ char bin_args[MAX_BPRINTF_BIN_ARGS];
+ char buf[MAX_BPRINTF_BUF];
+};
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
+static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
+
+static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
+{
+ int nest_level;
+
+ preempt_disable();
+ nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
+ if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
+ return -EBUSY;
+ }
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
+
+ return 0;
+}
+
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+ if (!data->bin_args && !data->buf)
+ return;
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
+}
+
+/*
+ * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
+ *
+ * Returns a negative value if fmt is an invalid format string or 0 otherwise.
+ *
+ * This can be used in two ways:
+ * - Format string verification only: when data->get_bin_args is false
+ * - Arguments preparation: in addition to the above verification, it writes in
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
+ *
+ * In argument preparation mode, if 0 is returned, safe temporary buffers are
+ * allocated and bpf_bprintf_cleanup should be called to free them after use.
+ */
+int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+ u32 num_args, struct bpf_bprintf_data *data)
+{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
+ char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
+ size_t sizeof_cur_arg, sizeof_cur_ip;
+ int err, i, num_spec = 0;
+ u64 cur_arg;
+ char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
+
+ fmt_end = strnchr(fmt, fmt_size, 0);
+ if (!fmt_end)
+ return -EINVAL;
+ fmt_size = fmt_end - fmt;
+
+ if (get_buffers && try_get_buffers(&buffers))
+ return -EBUSY;
+
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
+ }
+
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (num_spec >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The string is zero-terminated so if fmt[i] != 0, we can
+ * always access fmt[i + 1], in the worst case it will be a 0
+ */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formatting field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 'p') {
+ sizeof_cur_arg = sizeof(long);
+
+ if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
+ fmt[i + 2] == 's') {
+ fmt_ptype = fmt[i + 1];
+ i += 2;
+ goto fmt_str;
+ }
+
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
+ fmt[i + 1] == 'S') {
+ /* just kernel pointers */
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ i++;
+ goto nocopy_fmt;
+ }
+
+ if (fmt[i + 1] == 'B') {
+ if (tmp_buf) {
+ err = snprintf(tmp_buf,
+ (tmp_buf_end - tmp_buf),
+ "%pB",
+ (void *)(long)raw_args[num_spec]);
+ tmp_buf += (err + 1);
+ }
+
+ i++;
+ num_spec++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
+ (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ i += 2;
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
+ if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
+ sizeof_cur_ip);
+ if (err < 0)
+ memset(cur_ip, 0, sizeof_cur_ip);
+
+ /* hack: bstr_printf expects IP addresses to be
+ * pre-formatted as strings, ironically, the easiest way
+ * to do that is to call snprintf.
+ */
+ ip_spec[2] = fmt[i - 1];
+ ip_spec[3] = fmt[i];
+ err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
+ ip_spec, &cur_ip);
+
+ tmp_buf += err + 1;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 's') {
+ fmt_ptype = fmt[i];
+fmt_str:
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
+ fmt_ptype,
+ tmp_buf_end - tmp_buf);
+ if (err < 0) {
+ tmp_buf[0] = '\0';
+ err = 1;
+ }
+
+ tmp_buf += err;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 'c') {
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ *tmp_buf = raw_args[num_spec];
+ tmp_buf++;
+ num_spec++;
+
+ continue;
+ }
+
+ sizeof_cur_arg = sizeof(int);
+
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long);
+ i++;
+ }
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long long);
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
+ fmt[i] != 'x' && fmt[i] != 'X') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+nocopy_fmt:
+ if (tmp_buf) {
+ tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
+ if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (sizeof_cur_arg == 8) {
+ *(u32 *)tmp_buf = *(u32 *)&cur_arg;
+ *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
+ } else {
+ *(u32 *)tmp_buf = (u32)(long)cur_arg;
+ }
+ tmp_buf += sizeof_cur_arg;
+ }
+ num_spec++;
+ }
+
+ err = 0;
+out:
+ if (err)
+ bpf_bprintf_cleanup(data);
+ return err;
+}
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+ const void *, args, u32, data_len)
+{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
+ int err, num_args;
+
+ if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+ * can safely give an unbounded size.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
+ if (err < 0)
+ return err;
+
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
+
+ bpf_bprintf_cleanup(&data);
+
+ return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+ .func = bpf_snprintf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct hrtimer timer;
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+ struct rcu_head rcu;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space reserved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+ */
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ bpf_callback_t callback_fn;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_timer);
+ callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->value = (void *)timer - map->record->timer_off;
+ t->map = map;
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ WRITE_ONCE(timer->timer, t);
+ /* Guarantee the order between timer->timer and map->usercnt. So
+ * when there are concurrent uref release and bpf timer init, either
+ * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+ * timer or atomic64_read() below returns a zero usercnt.
+ */
+ smp_mb();
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ WRITE_ONCE(timer->timer, NULL);
+ kfree(t);
+ ret = -EPERM;
+ }
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&t->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = t->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ t->prog = prog;
+ }
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+ enum hrtimer_mode mode;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ if (flags & BPF_F_TIMER_CPU_PIN)
+ mode |= HRTIMER_MODE_PINNED;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+ struct bpf_prog *prog = t->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ }
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ rcu_read_lock();
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (this_cpu_read(hrtimer_running) == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+ drop_prog_refcnt(t);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ rcu_read_unlock();
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_timer_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+ if (!READ_ONCE(timer->timer))
+ return;
+
+ __bpf_spin_lock_irqsave(&timer->lock);
+ /* re-read it under lock */
+ t = timer->timer;
+ if (!t)
+ goto out;
+ drop_prog_refcnt(t);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ WRITE_ONCE(timer->timer, NULL);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+ * right after for both preallocated and non-preallocated maps.
+ * The timer->timer = NULL was already done and no code path can
+ * see address 't' anymore.
+ *
+ * Check that bpf_map_delete/update_elem() wasn't called from timer
+ * callback_fn. In such case don't call hrtimer_cancel() (since it will
+ * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+ * return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+ kfree_rcu(t, rcu);
+}
+
+BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+{
+ unsigned long *kptr = map_value;
+
+ return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
+ */
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
+ .func = bpf_kptr_xchg,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = BPF_PTR_POISON,
+ .arg1_type = ARG_PTR_TO_KPTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+ .arg2_btf_id = BPF_PTR_POISON,
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE ((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT 28
+#define DYNPTR_SIZE_MASK 0xFFFFFF
+#define DYNPTR_RDONLY_BIT BIT(31)
+
+static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_RDONLY_BIT;
+}
+
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+ ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
+u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+{
+ u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+ ptr->size = new_size | metadata;
+}
+
+int bpf_dynptr_check_size(u32 size)
+{
+ return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+ ptr->data = data;
+ ptr->offset = offset;
+ ptr->size = size;
+ bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+ memset(ptr, 0, sizeof(*ptr));
+}
+
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+{
+ u32 size = __bpf_dynptr_size(ptr);
+
+ if (len > size || offset > size - len)
+ return -E2BIG;
+
+ return 0;
+}
+
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ goto error;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+ return 0;
+
+error:
+ bpf_dynptr_set_null(ptr);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+ .func = bpf_dynptr_from_mem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+};
+
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!src->data || flags)
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(src, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(src);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
+ .func = bpf_dynptr_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!dst->data || __bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(dst, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(dst);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
+ .func = bpf_dynptr_write,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
+ .func = bpf_dynptr_data,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
@@ -623,6 +1697,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -635,6 +1711,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -645,6 +1723,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
default:
break;
}
@@ -657,12 +1741,52 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
- case BPF_FUNC_trace_printk:
- if (!perfmon_capable())
- return NULL;
- return bpf_get_trace_printk_proto();
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
+ case BPF_FUNC_per_cpu_ptr:
+ return &bpf_per_cpu_ptr_proto;
+ case BPF_FUNC_this_cpu_ptr:
+ return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
+ case BPF_FUNC_kptr_xchg:
+ return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
default:
break;
}
@@ -671,17 +1795,881 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
switch (func_id) {
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return &bpf_probe_read_kernel_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return &bpf_probe_read_kernel_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_snprintf_btf:
+ return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
return NULL;
}
}
+
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->graph_root.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ migrate_disable();
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ migrate_enable();
+ }
+}
+
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct rb_root_cached orig_root, *root = rb_root;
+ struct rb_node *pos, *n;
+ void *obj;
+
+ BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+ BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+ __bpf_spin_lock_irqsave(spin_lock);
+ orig_root = *root;
+ *root = RB_ROOT_CACHED;
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ obj = pos;
+ obj -= field->graph_root.node_offset;
+
+
+ migrate_disable();
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ migrate_enable();
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->record, p);
+ return p;
+}
+
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ u64 size = local_type_id__k;
+
+ /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
+{
+ struct bpf_mem_alloc *ma;
+
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
+ else
+ ma = &bpf_global_ma;
+ bpf_mem_free_rcu(ma, p);
+}
+
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ /* The verifier has ensured that meta__ign must be NULL */
+ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
+}
+
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+ if (!refcount_inc_not_zero((refcount_t *)ref))
+ return NULL;
+
+ /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
+ * in verifier.c
+ */
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+ struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
+{
+ struct list_head *n = &node->list_head, *h = (void *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+
+ /* node->owner != NULL implies !list_empty(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ tail ? list_add_tail(n, h) : list_add(n, h);
+ WRITE_ONCE(node->owner, head);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+ struct bpf_list_node_kern *node;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+
+ n = tail ? h->prev : h->next;
+ node = container_of(n, struct bpf_list_node_kern, list_head);
+ if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ return NULL;
+
+ list_del_init(n);
+ WRITE_ONCE(node->owner, NULL);
+ return (struct bpf_list_node *)n;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+ struct rb_node *n = &node_internal->rb_node;
+
+ /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+ * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+ */
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ rb_erase_cached(n, r);
+ RB_CLEAR_NODE(n);
+ WRITE_ONCE(node_internal->owner, NULL);
+ return (struct bpf_rb_node *)n;
+}
+
+/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
+ * program
+ */
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node_kern *node,
+ void *less, struct btf_record *rec, u64 off)
+{
+ struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = &node->rb_node;
+ bpf_callback_t cb = (bpf_callback_t)less;
+ bool leftmost = true;
+
+ /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ while (*link) {
+ parent = *link;
+ if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ WRITE_ONCE(node->owner, root);
+ return 0;
+}
+
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_rb_node_kern *n = (void *)node;
+
+ return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)rb_first_cached(r);
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
+{
+ put_task_struct_rcu_user(p);
+}
+
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+ put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ cgroup_put(cgrp);
+}
+
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+ cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
+ ancestor = cgrp->ancestors[level];
+ if (!cgroup_tryget(ancestor))
+ return NULL;
+ return ancestor;
+}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+ struct cgroup *ancestor)
+{
+ long ret;
+
+ rcu_read_lock();
+ ret = task_under_cgroup_hierarchy(task, ancestor);
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @task: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returen. On failure, NULL is returned.
+ */
+__bpf_kfunc struct cgroup *
+bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
+{
+ struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
+
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer__opt, u32 buffer__szk)
+{
+ enum bpf_dynptr_type type;
+ u32 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ if (buffer__opt)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ else
+ return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (!IS_ERR_OR_NULL(xdp_ptr))
+ return xdp_ptr;
+
+ if (!buffer__opt)
+ return NULL;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+ return buffer__opt;
+ }
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer__opt, u32 buffer__szk)
+{
+ if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possiblities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+}
+
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+{
+ u32 size;
+
+ if (!ptr->data || start > end)
+ return -EINVAL;
+
+ size = __bpf_dynptr_size(ptr);
+
+ if (start > size || end > size)
+ return -ERANGE;
+
+ ptr->offset += start;
+ bpf_dynptr_set_size(ptr, end - start);
+
+ return 0;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+{
+ return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return false;
+
+ return __bpf_dynptr_is_rdonly(ptr);
+}
+
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return -EINVAL;
+
+ return __bpf_dynptr_size(ptr);
+}
+
+__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
+ struct bpf_dynptr_kern *clone__uninit)
+{
+ if (!ptr->data) {
+ bpf_dynptr_set_null(clone__uninit);
+ return -EINVAL;
+ }
+
+ *clone__uninit = *ptr;
+
+ return 0;
+}
+
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+{
+ return obj__ign;
+}
+
+__bpf_kfunc void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+__bpf_kfunc void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ if (!is_bpf_text_address(ip))
+ return !ctx->cnt;
+ prog = bpf_prog_ksym_find(ip);
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(generic_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
+BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
+BTF_SET8_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release_dtor)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release_dtor)
+#endif
+
+BTF_SET8_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
+BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
+#endif
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_SET8_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+
+late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have continuous data up to len bytes, return NULL.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+ return bpf_dynptr_slice(ptr, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have continuous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+ return (void *)__bpf_dynptr_data(ptr, len);
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fb878ba3f22f..41e0a55c35f5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include "preload/bpf_preload.h"
enum bpf_type {
BPF_TYPE_UNSPEC = 0,
@@ -117,11 +118,9 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
- inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
+ simple_inode_init_ts(inode);
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -147,11 +146,11 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
-static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -226,10 +225,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
else
prev_key = key;
+ rcu_read_lock();
if (map->ops->map_get_next_key(map, prev_key, key)) {
map_iter(m)->done = true;
- return NULL;
+ key = NULL;
}
+ rcu_read_unlock();
return key;
}
@@ -369,16 +370,17 @@ static struct dentry *
bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
- * extensions.
+ * extensions. That allows popoulate_bpffs() create special files.
*/
- if (strchr(dentry->d_name.name, '.'))
+ if ((dir->i_mode & S_IALLUGO) &&
+ strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct inode *dir, struct dentry *dentry,
- const char *target)
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
struct inode *inode;
@@ -409,7 +411,28 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
-static int bpf_obj_do_pin(const char __user *pathname, void *raw,
+/* pin iterator link into bpffs */
+static int bpf_iter_link_pin_kernel(struct dentry *parent,
+ const char *name, struct bpf_link *link)
+{
+ umode_t mode = S_IFREG | S_IRUSR;
+ struct dentry *dentry;
+ int ret;
+
+ inode_lock(parent->d_inode);
+ dentry = lookup_one_len(name, parent, strlen(name));
+ if (IS_ERR(dentry)) {
+ inode_unlock(parent->d_inode);
+ return PTR_ERR(dentry);
+ }
+ ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
+ &bpf_iter_fops);
+ dput(dentry);
+ inode_unlock(parent->d_inode);
+ return ret;
+}
+
+static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -418,22 +441,21 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
+ dentry = user_path_create(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-
- ret = security_path_mknod(&path, dentry, mode, 0);
- if (ret)
- goto out;
-
dir = d_inode(path.dentry);
if (dir->i_op != &bpf_dir_iops) {
ret = -EPERM;
goto out;
}
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ ret = security_path_mknod(&path, dentry, mode, 0);
+ if (ret)
+ goto out;
+
switch (type) {
case BPF_TYPE_PROG:
ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
@@ -452,7 +474,7 @@ out:
return ret;
}
-int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
{
enum bpf_type type;
void *raw;
@@ -462,14 +484,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
if (IS_ERR(raw))
return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pathname, raw, type);
+ ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
return ret;
}
-static void *bpf_obj_do_get(const char __user *pathname,
+static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -477,12 +499,12 @@ static void *bpf_obj_do_get(const char __user *pathname,
void *raw;
int ret;
- ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, ACC_MODE(flags));
+ ret = path_permission(&path, ACC_MODE(flags));
if (ret)
goto out;
@@ -501,7 +523,7 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname, int flags)
+int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
int f_flags;
@@ -512,7 +534,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
if (f_flags < 0)
return f_flags;
- raw = bpf_obj_do_get(pathname, &type, f_flags);
+ raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
if (IS_ERR(raw))
return PTR_ERR(raw);
@@ -521,7 +543,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else if (type == BPF_TYPE_LINK)
- ret = bpf_link_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
else
return -ENOENT;
@@ -533,7 +555,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
@@ -577,8 +599,15 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
*/
static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
- umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct inode *inode = d_inode(root);
+ umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, inode->i_uid));
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
return 0;
@@ -603,15 +632,21 @@ static const struct super_operations bpf_super_ops = {
};
enum {
+ OPT_UID,
+ OPT_GID,
OPT_MODE,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32 ("uid", OPT_UID),
+ fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
{}
};
struct bpf_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
umode_t mode;
};
@@ -619,23 +654,131 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct bpf_mount_opts *opts = fc->fs_private;
struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
int opt;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
- if (opt < 0)
+ if (opt < 0) {
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
* better continue to ignore non-existing options for bpf.
*/
- return opt == -ENOPARAM ? 0 : opt;
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
+
+ return 0;
+ }
+
+ if (opt < 0)
+ return opt;
+ }
switch (opt) {
+ case OPT_UID:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, uid))
+ goto bad_value;
+
+ opts->uid = uid;
+ break;
+ case OPT_GID:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, gid))
+ goto bad_value;
+
+ opts->gid = gid;
+ break;
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
}
return 0;
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
+}
+
+struct bpf_preload_ops *bpf_preload_ops;
+EXPORT_SYMBOL_GPL(bpf_preload_ops);
+
+static bool bpf_preload_mod_get(void)
+{
+ /* If bpf_preload.ko wasn't loaded earlier then load it now.
+ * When bpf_preload is built into vmlinux the module's __init
+ * function will populate it.
+ */
+ if (!bpf_preload_ops) {
+ request_module("bpf_preload");
+ if (!bpf_preload_ops)
+ return false;
+ }
+ /* And grab the reference, so the module doesn't disappear while the
+ * kernel is interacting with the kernel module and its UMD.
+ */
+ if (!try_module_get(bpf_preload_ops->owner)) {
+ pr_err("bpf_preload module get failed.\n");
+ return false;
+ }
+ return true;
+}
+
+static void bpf_preload_mod_put(void)
+{
+ if (bpf_preload_ops)
+ /* now user can "rmmod bpf_preload" if necessary */
+ module_put(bpf_preload_ops->owner);
+}
+
+static DEFINE_MUTEX(bpf_preload_lock);
+
+static int populate_bpffs(struct dentry *parent)
+{
+ struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
+ int err = 0, i;
+
+ /* grab the mutex to make sure the kernel interactions with bpf_preload
+ * are serialized
+ */
+ mutex_lock(&bpf_preload_lock);
+
+ /* if bpf_preload.ko wasn't built into vmlinux then load it */
+ if (!bpf_preload_mod_get())
+ goto out;
+
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
+ goto out_put;
+ }
+ }
+out_put:
+ bpf_preload_mod_put();
+out:
+ mutex_unlock(&bpf_preload_lock);
+ return err;
}
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -652,10 +795,12 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &bpf_super_ops;
inode = sb->s_root->d_inode;
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
+ populate_bpffs(sb->s_root);
inode->i_mode |= S_ISVTX | opts->mode;
-
return 0;
}
@@ -687,6 +832,8 @@ static int bpf_init_fs_context(struct fs_context *fc)
return -ENOMEM;
opts->mode = S_IRWXUGO;
+ opts->uid = current_fsuid();
+ opts->gid = current_fsgid();
fc->fs_private = opts;
fc->ops = &bpf_context_ops;
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..fec8005a121c
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+ u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+ struct bpf_link *link;
+
+ link = bpf_link_get_curr_or_next(&info->link_id);
+ if (!link)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+
+ ++*pos;
+ ++info->link_id;
+ bpf_link_put((struct bpf_link *)v);
+ return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_link ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.link = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_link_seq_show(seq, v, true);
+ else
+ bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+ .start = bpf_link_seq_start,
+ .next = bpf_link_seq_next,
+ .stop = bpf_link_seq_stop,
+ .show = bpf_link_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_link_id)
+BTF_ID(struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+ .seq_ops = &bpf_link_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+ .target = "bpf_link",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_link, link),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+ bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+ return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 33d01866bcc2..a04f505aefe9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,6 +1,7 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
@@ -8,11 +9,12 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <uapi/linux/btf.h>
-
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+#include <linux/btf_ids.h>
#ifdef CONFIG_CGROUP_BPF
+#include "../cgroup/cgroup-internal.h"
+
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
@@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map {
struct bpf_map map;
spinlock_t lock;
- struct bpf_prog_aux *aux;
struct rb_root root;
struct list_head list;
};
@@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
return container_of(map, struct bpf_cgroup_storage_map, map);
}
-static int bpf_cgroup_storage_key_cmp(
- const struct bpf_cgroup_storage_key *key1,
- const struct bpf_cgroup_storage_key *key2)
+static bool attach_type_isolated(const struct bpf_map *map)
{
- if (key1->cgroup_inode_id < key2->cgroup_inode_id)
- return -1;
- else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
- return 1;
- else if (key1->attach_type < key2->attach_type)
- return -1;
- else if (key1->attach_type > key2->attach_type)
- return 1;
+ return map->key_size == sizeof(struct bpf_cgroup_storage_key);
+}
+
+static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map,
+ const void *_key1, const void *_key2)
+{
+ if (attach_type_isolated(&map->map)) {
+ const struct bpf_cgroup_storage_key *key1 = _key1;
+ const struct bpf_cgroup_storage_key *key2 = _key2;
+
+ if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+ return -1;
+ else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+ return 1;
+ else if (key1->attach_type < key2->attach_type)
+ return -1;
+ else if (key1->attach_type > key2->attach_type)
+ return 1;
+ } else {
+ const __u64 *cgroup_inode_id1 = _key1;
+ const __u64 *cgroup_inode_id2 = _key2;
+
+ if (*cgroup_inode_id1 < *cgroup_inode_id2)
+ return -1;
+ else if (*cgroup_inode_id1 > *cgroup_inode_id2)
+ return 1;
+ }
return 0;
}
-static struct bpf_cgroup_storage *cgroup_storage_lookup(
- struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
- bool locked)
+struct bpf_cgroup_storage *
+cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
+ void *key, bool locked)
{
struct rb_root *root = &map->root;
struct rb_node *node;
@@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup(
storage = container_of(node, struct bpf_cgroup_storage, node);
- switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+ switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) {
case -1:
node = node->rb_left;
break;
@@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
this = container_of(*new, struct bpf_cgroup_storage, node);
parent = *new;
- switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+ switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) {
case -1:
new = &((*new)->rb_left);
break;
@@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
return 0;
}
-static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
storage = cgroup_storage_lookup(map, key, false);
@@ -124,21 +141,17 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
- return -EINVAL;
-
- if (unlikely(flags & BPF_NOEXIST))
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST)))
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -151,15 +164,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return 0;
}
- new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
- map->value_size,
- __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
- map->numa_node);
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
+ __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
+ map->numa_node);
if (!new)
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
- check_and_init_map_lock(map, new->data);
+ check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -167,11 +179,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return 0;
}
-int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
void *value)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
@@ -197,11 +208,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
return 0;
}
-int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
void *value, u64 map_flags)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
@@ -232,12 +242,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
return 0;
}
-static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
void *_next_key)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
- struct bpf_cgroup_storage_key *next = _next_key;
struct bpf_cgroup_storage *storage;
spin_lock_bh(&map->lock);
@@ -250,17 +258,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
if (!storage)
goto enoent;
- storage = list_next_entry(storage, list);
+ storage = list_next_entry(storage, list_map);
if (!storage)
goto enoent;
} else {
storage = list_first_entry(&map->list,
- struct bpf_cgroup_storage, list);
+ struct bpf_cgroup_storage, list_map);
}
spin_unlock_bh(&map->lock);
- next->attach_type = storage->key.attach_type;
- next->cgroup_inode_id = storage->key.cgroup_inode_id;
+
+ if (attach_type_isolated(&map->map)) {
+ struct bpf_cgroup_storage_key *next = _next_key;
+ *next = storage->key;
+ } else {
+ __u64 *next = _next_key;
+ *next = storage->key.cgroup_inode_id;
+ }
return 0;
enoent:
@@ -270,18 +284,25 @@ enoent:
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
+ __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
- struct bpf_map_memory mem;
- int ret;
- if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+ /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu
+ * is the same as other local storages.
+ */
+ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ max_value_size = min_t(__u32, max_value_size,
+ PCPU_MIN_UNIT_SIZE);
+
+ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
+ attr->key_size != sizeof(__u64))
return ERR_PTR(-EINVAL);
if (attr->value_size == 0)
return ERR_PTR(-EINVAL);
- if (attr->value_size > PAGE_SIZE)
+ if (attr->value_size > max_value_size)
return ERR_PTR(-E2BIG);
if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -292,18 +313,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
- if (ret < 0)
- return ERR_PTR(ret);
-
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER, numa_node);
- if (!map) {
- bpf_map_charge_finish(&mem);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
+ if (!map)
return ERR_PTR(-ENOMEM);
- }
-
- bpf_map_charge_move(&map->map.memory, &mem);
/* copy mandatory map attributes */
bpf_map_init_from_attr(&map->map, attr);
@@ -318,14 +330,25 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *_map)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct list_head *storages = &map->list;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ cgroup_lock();
+
+ list_for_each_entry_safe(storage, stmp, storages, list_map) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
+ cgroup_unlock();
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -335,49 +358,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- struct btf_member *m;
- u32 offset, size;
-
- /* Key is expected to be of struct bpf_cgroup_storage_key type,
- * which is:
- * struct bpf_cgroup_storage_key {
- * __u64 cgroup_inode_id;
- * __u32 attach_type;
- * };
- */
+ if (attach_type_isolated(map)) {
+ struct btf_member *m;
+ u32 offset, size;
+
+ /* Key is expected to be of struct bpf_cgroup_storage_key type,
+ * which is:
+ * struct bpf_cgroup_storage_key {
+ * __u64 cgroup_inode_id;
+ * __u32 attach_type;
+ * };
+ */
+
+ /*
+ * Key_type must be a structure with two fields.
+ */
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+ BTF_INFO_VLEN(key_type->info) != 2)
+ return -EINVAL;
+
+ /*
+ * The first field must be a 64 bit integer at 0 offset.
+ */
+ m = (struct btf_member *)(key_type + 1);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+ return -EINVAL;
+
+ /*
+ * The second field must be a 32 bit integer at 64 bit offset.
+ */
+ m++;
+ offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
+ if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+ return -EINVAL;
+ } else {
+ u32 int_data;
- /*
- * Key_type must be a structure with two fields.
- */
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
- BTF_INFO_VLEN(key_type->info) != 2)
- return -EINVAL;
+ /*
+ * Key is expected to be u64, which stores the cgroup_inode_id
+ */
- /*
- * The first field must be a 64 bit integer at 0 offset.
- */
- m = (struct btf_member *)(key_type + 1);
- size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
- if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
- return -EINVAL;
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
- /*
- * The second field must be a 32 bit integer at 64 bit offset.
- */
- m++;
- offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
- size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
- if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
- return -EINVAL;
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+ }
return 0;
}
-static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage_key *key = _key;
+ enum bpf_cgroup_storage_type stype;
struct bpf_cgroup_storage *storage;
int cpu;
@@ -409,6 +446,14 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
rcu_read_unlock();
}
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+ bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -418,43 +463,20 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
+ .map_mem_usage = cgroup_storage_map_usage,
+ .map_btf_id = &cgroup_storage_map_btf_ids[0],
};
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
- struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- int ret = -EBUSY;
-
- spin_lock_bh(&map->lock);
- if (map->aux && map->aux != aux)
- goto unlock;
if (aux->cgroup_storage[stype] &&
aux->cgroup_storage[stype] != _map)
- goto unlock;
+ return -EBUSY;
- map->aux = aux;
aux->cgroup_storage[stype] = _map;
- ret = 0;
-unlock:
- spin_unlock_bh(&map->lock);
-
- return ret;
-}
-
-void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
-{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
- struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-
- spin_lock_bh(&map->lock);
- if (map->aux == aux) {
- WARN_ON(aux->cgroup_storage[stype] != _map);
- map->aux = NULL;
- aux->cgroup_storage[stype] = NULL;
- }
- spin_unlock_bh(&map->lock);
+ return 0;
}
static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
@@ -477,9 +499,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
enum bpf_cgroup_storage_type stype)
{
+ const gfp_t gfp = __GFP_ZERO | GFP_USER;
struct bpf_cgroup_storage *storage;
struct bpf_map *map;
- gfp_t flags;
size_t size;
u32 pages;
@@ -489,23 +511,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
size = bpf_cgroup_storage_calculate_size(map, &pages);
- if (bpf_map_charge_memlock(map, pages))
- return ERR_PTR(-EPERM);
-
- storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
- __GFP_ZERO | GFP_USER, map->numa_node);
+ storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage),
+ gfp, map->numa_node);
if (!storage)
goto enomem;
- flags = __GFP_ZERO | GFP_USER;
-
if (stype == BPF_CGROUP_STORAGE_SHARED) {
- storage->buf = kmalloc_node(size, flags, map->numa_node);
+ storage->buf = bpf_map_kmalloc_node(map, size, gfp,
+ map->numa_node);
if (!storage->buf)
goto enomem;
- check_and_init_map_lock(map, storage->buf->data);
+ check_and_init_map_value(map, storage->buf->data);
} else {
- storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+ storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
goto enomem;
}
@@ -515,7 +533,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
return storage;
enomem:
- bpf_map_uncharge_memlock(map, pages);
kfree(storage);
return ERR_PTR(-ENOMEM);
}
@@ -542,16 +559,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
{
enum bpf_cgroup_storage_type stype;
struct bpf_map *map;
- u32 pages;
if (!storage)
return;
map = &storage->map->map;
-
- bpf_cgroup_storage_calculate_size(map, &pages);
- bpf_map_uncharge_memlock(map, pages);
-
stype = cgroup_storage_type(map);
if (stype == BPF_CGROUP_STORAGE_SHARED)
call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
@@ -575,7 +587,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
spin_lock_bh(&map->lock);
WARN_ON(cgroup_storage_insert(map, storage));
- list_add(&storage->list, &map->list);
+ list_add(&storage->list_map, &map->list);
+ list_add(&storage->list_cg, &cgroup->bpf.storages);
spin_unlock_bh(&map->lock);
}
@@ -593,7 +606,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
root = &map->root;
rb_erase(&storage->node, root);
- list_del(&storage->list);
+ list_del(&storage->list_map);
+ list_del(&storage->list_cg);
spin_unlock_bh(&map->lock);
}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..594a234f122b
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,831 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end, new_n;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
+
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 i, nr_linfo;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ for (i = 1; i < nr_linfo; i++)
+ if (insn_off < linfo[i].insn_off)
+ break;
+
+ return &linfo[i - 1];
+}
+
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == env->prev_linfo)
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ verbose(env, "%s\n",
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
+ linfo->line_off)));
+
+ env->prev_linfo = linfo;
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[64] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
+
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->tmp_str_buf;
+}
+
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
+static char slot_type_char[] = {
+ [STACK_INVALID] = '?',
+ [STACK_SPILL] = 'r',
+ [STACK_MISC] = 'm',
+ [STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
+};
+
+static void print_liveness(struct bpf_verifier_env *env,
+ enum bpf_reg_liveness live)
+{
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
+ verbose(env, "_");
+ if (live & REG_LIVE_READ)
+ verbose(env, "r");
+ if (live & REG_LIVE_WRITTEN)
+ verbose(env, "w");
+ if (live & REG_LIVE_DONE)
+ verbose(env, "D");
+}
+
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
+static bool is_unum_decimal(u64 num)
+{
+ return num <= UNUM_MAX_DECIMAL;
+}
+
+static bool is_snum_decimal(s64 num)
+{
+ return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL;
+}
+
+static void verbose_unum(struct bpf_verifier_env *env, u64 num)
+{
+ if (is_unum_decimal(num))
+ verbose(env, "%llu", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+static void verbose_snum(struct bpf_verifier_env *env, s64 num)
+{
+ if (is_snum_decimal(num))
+ verbose(env, "%lld", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ /* print as a constant, if tnum is fully known */
+ if (a.mask == 0) {
+ if (is_unum_decimal(a.value))
+ return snprintf(str, size, "%llu", a.value);
+ else
+ return snprintf(str, size, "%#llx", a.value);
+ }
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char **sep)
+{
+ /* For signed ranges, we want to unify 64-bit and 32-bit values in the
+ * output as much as possible, but there is a bit of a complication.
+ * If we choose to print values as decimals, this is natural to do,
+ * because negative 64-bit and 32-bit values >= -S32_MIN have the same
+ * representation due to sign extension. But if we choose to print
+ * them in hex format (see is_snum_decimal()), then sign extension is
+ * misleading.
+ * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
+ * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
+ * very different numbers.
+ * So we avoid sign extension if we choose to print values in hex.
+ */
+ struct {
+ const char *name;
+ u64 val;
+ bool omit;
+ } minmaxs[] = {
+ {"smin", reg->smin_value, reg->smin_value == S64_MIN},
+ {"smax", reg->smax_value, reg->smax_value == S64_MAX},
+ {"umin", reg->umin_value, reg->umin_value == 0},
+ {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin32",
+ is_snum_decimal((s64)reg->s32_min_value)
+ ? (s64)reg->s32_min_value
+ : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ {"smax32",
+ is_snum_decimal((s64)reg->s32_max_value)
+ ? (s64)reg->s32_max_value
+ : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+ {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
+ {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+ bool neg1, neg2;
+
+ for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+ if (m1->omit)
+ continue;
+
+ neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+ verbose(env, "%s%s=", *sep, m1->name);
+ *sep = ",";
+
+ for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+ if (m2->omit || m2->val != m1->val)
+ continue;
+ /* don't mix negatives with positives */
+ neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+ if (neg2 != neg1)
+ continue;
+ m2->omit = true;
+ verbose(env, "%s=", m2->name);
+ }
+
+ if (m1->name[0] == 's')
+ verbose_snum(env, m1->val);
+ else
+ verbose_unum(env, m1->val);
+ }
+}
+
+static bool type_is_map_ptr(enum bpf_reg_type t) {
+ switch (base_type(t)) {
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
+static void print_reg_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state,
+ const struct bpf_reg_state *reg)
+{
+ enum bpf_reg_type t;
+ const char *sep = "";
+
+ t = reg->type;
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_STACK) {
+ if (state->frameno != reg->frameno)
+ verbose(env, "[%d]", reg->frameno);
+ if (tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+ }
+ if (base_type(t) == PTR_TO_BTF_ID)
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+ verbose(env, "(");
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
+ if (type_is_map_ptr(t)) {
+ if (reg->map_ptr->name[0])
+ verbose_a("map=%s", reg->map_ptr->name);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ }
+ if (t != SCALAR_VALUE && reg->off) {
+ verbose_a("off=");
+ verbose_snum(env, reg->off);
+ }
+ if (type_is_pkt_pointer(t)) {
+ verbose_a("r=");
+ verbose_unum(env, reg->range);
+ }
+ if (base_type(t) == PTR_TO_MEM) {
+ verbose_a("sz=");
+ verbose_unum(env, reg->mem_size);
+ }
+ if (t == CONST_PTR_TO_DYNPTR)
+ verbose_a("type=%s", dynptr_type_str(reg->dynptr.type));
+ if (tnum_is_const(reg->var_off)) {
+ /* a pointer register with fixed offset */
+ if (reg->var_off.value) {
+ verbose_a("imm=");
+ verbose_snum(env, reg->var_off.value);
+ }
+ } else {
+ print_scalar_ranges(env, reg, &sep);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose_a("var_off=%s", tn_buf);
+ }
+ }
+ verbose(env, ")");
+}
+
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
+ bool print_all)
+{
+ const struct bpf_reg_state *reg;
+ int i;
+
+ if (state->frameno)
+ verbose(env, " frame%d:", state->frameno);
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == NOT_INIT)
+ continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
+ verbose(env, " R%d", i);
+ print_liveness(env, reg->live);
+ verbose(env, "=");
+ print_reg_state(env, state, reg);
+ }
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ char types_buf[BPF_REG_SIZE + 1];
+ const char *sep = "";
+ bool valid = false;
+ u8 slot_type;
+ int j;
+
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
+
+ for (j = 0; j < BPF_REG_SIZE; j++) {
+ slot_type = state->stack[i].slot_type[j];
+ if (slot_type != STACK_INVALID)
+ valid = true;
+ types_buf[j] = slot_type_char[slot_type];
+ }
+ types_buf[BPF_REG_SIZE] = 0;
+ if (!valid)
+ continue;
+
+ reg = &state->stack[i].spilled_ptr;
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* print MISC/ZERO/INVALID slots above subreg spill */
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (state->stack[i].slot_type[j] == STACK_SPILL)
+ break;
+ types_buf[j] = '\0';
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", types_buf);
+ print_reg_state(env, state, reg);
+ break;
+ case STACK_DYNPTR:
+ /* skip to main dynptr slot */
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_id=%d", reg->ref_obj_id);
+ if (reg->dynptr_id)
+ verbose_a("dynptr_id=%d", reg->dynptr_id);
+ verbose(env, ")");
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", types_buf);
+ break;
+ }
+ }
+ if (state->acquired_refs && state->refs[0].id) {
+ verbose(env, " refs=%d", state->refs[0].id);
+ for (i = 1; i < state->acquired_refs; i++)
+ if (state->refs[i].id)
+ verbose(env, ",%d", state->refs[i].id);
+ }
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
+ verbose(env, "\n");
+ if (!print_all)
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+{
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index c8cc4e4cf98d..b32be680da6c 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -230,9 +231,13 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
struct lpm_trie_node *node, *found = NULL;
struct bpf_lpm_trie_key *key = _key;
+ if (key->prefixlen > trie->max_prefixlen)
+ return NULL;
+
/* Start walking the trie from the root node ... */
- for (node = rcu_dereference(trie->root); node;) {
+ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+ node;) {
unsigned int next_bit;
size_t matchlen;
@@ -264,7 +269,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* traverse down.
*/
next_bit = extract_bit(key->data, node->prefixlen);
- node = rcu_dereference(node->child[next_bit]);
+ node = rcu_dereference_check(node->child[next_bit],
+ rcu_read_lock_bh_held());
}
if (!found)
@@ -282,8 +288,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
- trie->map.numa_node);
+ node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
+ trie->map.numa_node);
if (!node)
return NULL;
@@ -297,8 +303,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
}
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -410,7 +416,7 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(im_node->child[1], node);
}
- /* Finally, assign the intermediate node to the determined spot */
+ /* Finally, assign the intermediate node to the determined slot */
rcu_assign_pointer(*slot, im_node);
out:
@@ -428,7 +434,7 @@ out:
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key;
@@ -540,11 +546,6 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
- u64 cost = sizeof(*trie), cost_per_node;
- int ret;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 ||
@@ -557,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
@@ -567,20 +568,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
- cost_per_node = sizeof(struct lpm_trie_node) +
- attr->value_size + trie->data_size;
- cost += (u64) attr->max_entries * cost_per_node;
-
- ret = bpf_map_charge_init(&trie->map.memory, cost);
- if (ret)
- goto out_err;
-
spin_lock_init(&trie->lock);
return &trie->map;
-out_err:
- kfree(trie);
- return ERR_PTR(ret);
}
static void trie_free(struct bpf_map *map)
@@ -589,11 +579,6 @@ static void trie_free(struct bpf_map *map)
struct lpm_trie_node __rcu **slot;
struct lpm_trie_node *node;
- /* Wait for outstanding programs to complete
- * update/lookup/delete/get_next_key and free the trie.
- */
- synchronize_rcu();
-
/* Always start at the root and walk down to a node that has no
* children. Then free that node, nullify its reference in the parent
* and start over.
@@ -624,7 +609,7 @@ static void trie_free(struct bpf_map *map)
}
out:
- kfree(trie);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
@@ -735,12 +720,29 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = trie_alloc,
.map_free = trie_free,
.map_get_next_key = trie_get_next_key,
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
+ .map_mem_usage = trie_mem_usage,
+ .map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 17738c93bec8..8ef269e66ba5 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include "map_in_map.h"
@@ -11,32 +12,22 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
struct fd f;
+ int ret;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
- /* prog_array->aux->{type,jited} is a runtime binding.
- * Doing static check alone in the verifier is not enough.
- */
- if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
- inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto put;
}
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
+ if (!inner_map->ops->map_meta_equal) {
+ ret = -ENOTSUPP;
+ goto put;
}
inner_map_meta_size = sizeof(*inner_map_meta);
@@ -46,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
- fdput(f);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto put;
}
inner_map_meta->map_type = inner_map->map_type;
@@ -55,22 +46,51 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ ret = PTR_ERR(inner_map_meta->record);
+ goto free;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
if (inner_map->ops == &array_map_ops) {
+ struct bpf_array *inner_array_meta =
+ container_of(inner_map_meta, struct bpf_array, map);
+ struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
+
+ inner_array_meta->index_mask = inner_array->index_mask;
+ inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
- container_of(inner_map_meta, struct bpf_array, map)->index_mask =
- container_of(inner_map, struct bpf_array, map)->index_mask;
}
fdput(f);
return inner_map_meta;
+free:
+ kfree(inner_map_meta);
+put:
+ fdput(f);
+ return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ bpf_map_free_record(map_meta);
+ btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -82,14 +102,14 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
meta0->map_flags == meta1->map_flags &&
- meta0->max_entries == meta1->max_entries;
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
int ufd)
{
- struct bpf_map *inner_map;
+ struct bpf_map *inner_map, *inner_map_meta;
struct fd f;
f = fdget(ufd);
@@ -97,7 +117,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
if (IS_ERR(inner_map))
return inner_map;
- if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
+ inner_map_meta = map->inner_map_meta;
+ if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map))
bpf_map_inc(inner_map);
else
inner_map = ERR_PTR(-EINVAL);
@@ -106,12 +127,21 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
return inner_map;
}
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
- /* ptr->ops->map_free() has to go through one
- * rcu grace period by itself.
+ struct bpf_map *inner_map = ptr;
+
+ /* Defer the freeing of inner map according to the sleepable attribute
+ * of bpf program which owns the outer map, so unnecessary waiting for
+ * RCU tasks trace grace period can be avoided.
*/
- bpf_map_put(ptr);
+ if (need_defer) {
+ if (atomic64_read(&map->sleepable_refcnt))
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ else
+ WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+ }
+ bpf_map_put(inner_map);
}
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index a507bf6ef8b9..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -11,11 +11,9 @@ struct bpf_map;
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
-bool bpf_map_meta_equal(const struct bpf_map *meta0,
- const struct bpf_map *meta1);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index c69071e334bf..6abd7c5df4b3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -4,9 +4,10 @@
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/kernel.h>
+#include <linux/btf_ids.h>
struct bpf_iter_seq_map_info {
- u32 mid;
+ u32 map_id;
};
static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
@@ -14,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
- map = bpf_map_get_curr_or_next(&info->mid);
+ map = bpf_map_get_curr_or_next(&info->map_id);
if (!map)
return NULL;
- ++*pos;
+ if (*pos == 0)
+ ++*pos;
return map;
}
static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
- struct bpf_map *map;
++*pos;
- ++info->mid;
+ ++info->map_id;
bpf_map_put((struct bpf_map *)v);
- map = bpf_map_get_curr_or_next(&info->mid);
- if (!map)
- return NULL;
-
- return map;
+ return bpf_map_get_curr_or_next(&info->map_id);
}
struct bpf_iter__bpf_map {
@@ -81,22 +78,152 @@ static const struct seq_operations bpf_map_seq_ops = {
.show = bpf_map_seq_show,
};
-static const struct bpf_iter_reg bpf_map_reg_info = {
- .target = "bpf_map",
+BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map)
+
+static const struct bpf_iter_seq_info bpf_map_seq_info = {
.seq_ops = &bpf_map_seq_ops,
.init_seq_private = NULL,
.fini_seq_private = NULL,
.seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
+};
+
+static struct bpf_iter_reg bpf_map_reg_info = {
+ .target = "bpf_map",
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &bpf_map_seq_info,
+};
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ u32 key_acc_size, value_acc_size, key_size, value_size;
+ struct bpf_map *map;
+ bool is_percpu = false;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ is_percpu = true;
+ else if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ goto put_map;
+
+ key_acc_size = prog->aux->max_rdonly_access;
+ value_acc_size = prog->aux->max_rdwr_access;
+ key_size = map->key_size;
+ if (!is_percpu)
+ value_size = map->value_size;
+ else
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+
+ if (key_acc_size > key_size || value_acc_size > value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_printf(seq, "map_id:\t%u\n", aux->map->id);
+}
+
+int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.map.map_id = aux->map->id;
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key, void *value)
+
+static const struct bpf_iter_reg bpf_map_elem_reg_info = {
+ .target = "bpf_map_elem",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map_elem, key),
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+ { offsetof(struct bpf_iter__bpf_map_elem, value),
+ PTR_TO_BUF | PTR_MAYBE_NULL },
},
};
static int __init bpf_map_iter_init(void)
{
- return bpf_iter_reg_target(&bpf_map_reg_info);
+ int ret;
+
+ bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
+ ret = bpf_iter_reg_target(&bpf_map_reg_info);
+ if (ret)
+ return ret;
+
+ return bpf_iter_reg_target(&bpf_map_elem_reg_info);
}
late_initcall(bpf_map_iter_init);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
+{
+ s64 *pcount;
+ s64 ret = 0;
+ int cpu;
+
+ if (!map || !map->elem_count)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(map->elem_count, cpu);
+ ret += READ_ONCE(*pcount);
+ }
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_map_iter_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_map_iter_kfunc_ids,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..550f02e2cb13
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 2;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+ bool draining;
+ struct bpf_mem_cache *tgt;
+
+ /* list of objects to be freed after RCU GP */
+ struct llist_head free_by_rcu;
+ struct llist_node *free_by_rcu_tail;
+ struct llist_head waiting_for_gp;
+ struct llist_node *waiting_for_gp_tail;
+ struct rcu_head rcu;
+ atomic_t call_rcu_in_progress;
+ struct llist_head free_llist_extra_rcu;
+
+ /* list of objects to be freed after RCU tasks trace GP */
+ struct llist_head free_by_rcu_ttrace;
+ struct llist_head waiting_for_gp_ttrace;
+ struct rcu_head rcu_ttrace;
+ atomic_t call_rcu_ttrace_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
+{
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(*flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+}
+
+static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
+{
+ unsigned long flags;
+
+ inc_active(c, &flags);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ dec_active(c, &flags);
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ gfp_t gfp;
+ void *obj;
+ int i;
+
+ gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+ gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
+ for (i = 0; i < cnt; i++) {
+ /*
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
+ */
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ for (; i < cnt; i++) {
+ obj = llist_del_first(&c->waiting_for_gp_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (; i < cnt; i++) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, gfp);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(void *obj, bool percpu)
+{
+ if (percpu) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static int free_all(struct llist_node *llnode, bool percpu)
+{
+ struct llist_node *pos, *t;
+ int cnt = 0;
+
+ llist_for_each_safe(pos, t, llnode) {
+ free_one(pos, percpu);
+ cnt++;
+ }
+ return cnt;
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
+
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ atomic_set(&c->call_rcu_ttrace_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu_ttrace list.
+ */
+ llist_add(llnode, &c->free_by_rcu_ttrace);
+}
+
+static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+ if (unlikely(READ_ONCE(c->draining))) {
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
+ free_all(llnode, !!c->percpu_size);
+ }
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
+ llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ __free_rcu(&c->rcu_ttrace);
+ return;
+ }
+
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ do {
+ inc_active(c, &flags);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ dec_active(c, &flags);
+ if (llnode)
+ enque_to_free(tgt, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
+}
+
+static void __free_by_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ llnode = llist_del_all(&c->waiting_for_gp);
+ if (!llnode)
+ goto out;
+
+ llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+ /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+ do_call_rcu_ttrace(tgt);
+out:
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+
+ /* drain free_llist_extra_rcu */
+ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+ inc_active(c, &flags);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ dec_active(c, &flags);
+ }
+
+ if (llist_empty(&c->free_by_rcu))
+ return;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+ /*
+ * Instead of kmalloc-ing new rcu_head and triggering 10k
+ * call_rcu() to hit rcutree.qhimark and force RCU to notice
+ * the overload just ask RCU to hurry up. There could be many
+ * objects in free_by_rcu list.
+ * This hint reduces memory consumption for an artificial
+ * benchmark from 2 Gbyte to 150 Mbyte.
+ */
+ rcu_request_urgent_qs_task(current);
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+ inc_active(c, &flags);
+ WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+ c->waiting_for_gp_tail = c->free_by_rcu_tail;
+ dec_active(c, &flags);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ atomic_set(&c->call_rcu_in_progress, 0);
+ } else {
+ call_rcu_hurry(&c->rcu, __free_by_rcu);
+ }
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+
+ check_free_by_rcu(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
+ */
+static void init_refill_work(struct bpf_mem_cache *c)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->percpu_size) {
+ c->low_watermark = 1;
+ c->high_watermark = 3;
+ } else if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ int cnt = 1;
+
+ /* To avoid consuming memory, for non-percpu allocation, assume that
+ * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+ * irq disabled region if unit size is less than or equal to 256.
+ * For all other cases, let us just do one allocation.
+ */
+ if (!c->percpu_size && c->unit_size <= 256)
+ cnt = 4;
+ alloc_bulk(c, cnt, cpu_to_node(cpu), false);
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (percpu && size == 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (!percpu)
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ }
+
+ ma->caches = pcc;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+ struct bpf_mem_caches __percpu *pcc;
+
+ pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+
+ ma->caches = pcc;
+ ma->objcg = objcg;
+ ma->percpu = true;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ int cpu, i, unit_size, percpu_size;
+ struct obj_cgroup *objcg;
+ struct bpf_mem_cache *c;
+
+ i = bpf_mem_cache_idx(size);
+ if (i < 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+ unit_size = sizes[i];
+ objcg = ma->objcg;
+ pcc = ma->caches;
+
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ c = &cc->cache[i];
+ if (c->unit_size)
+ break;
+
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ bool percpu = !!c->percpu_size;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
+ *
+ * Except for waiting_for_gp_ttrace list, there are no concurrent operations
+ * on these lists, so it is safe to use __llist_del_all().
+ */
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+}
+
+static void check_mem_cache(struct bpf_mem_cache *c)
+{
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ check_mem_cache(c);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ check_mem_cache(c);
+ }
+ }
+ }
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ check_leaked_objs(ma);
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ * might still execute. Wait for them.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
+ */
+ rcu_barrier(); /* wait for __free_by_rcu */
+ rcu_barrier_tasks_trace(); /* wait for __free_rcu */
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ memset(ma, 0, sizeof(*ma));
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ if (ma->objcg)
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (ma->objcg)
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode) {
+ cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
+ }
+ local_dec(&c->active);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but can
+ * not been triggered and free_llist can not be refilled timely.
+ */
+ local_irq_restore(flags);
+}
+
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ } else {
+ llist_add(llnode, &c->free_llist_extra_rcu);
+ }
+ local_dec(&c->active);
+
+ if (!atomic_read(&c->call_rcu_in_progress))
+ irq_work_raise(c);
+ local_irq_restore(flags);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return NULL;
+
+ if (!ma->percpu)
+ size += LLIST_NODE_SZ;
+ idx = bpf_mem_cache_idx(size);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
+
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (ret)
+ *(struct bpf_mem_cache **)ret = c;
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
new file mode 100644
index 000000000000..1394168062e8
--- /dev/null
+++ b/kernel/bpf/mprog.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+static int bpf_mprog_link(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ if (type && link->prog->type != type) {
+ bpf_link_put(link);
+ return -EINVAL;
+ }
+
+ tuple->link = link;
+ tuple->prog = link->prog;
+ return 0;
+}
+
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ if (type && prog->type != type) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ tuple->link = NULL;
+ tuple->prog = prog;
+ return 0;
+}
+
+static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ bool link = flags & BPF_F_LINK;
+ bool id = flags & BPF_F_ID;
+
+ memset(tuple, 0, sizeof(*tuple));
+ if (link)
+ return bpf_mprog_link(tuple, id_or_fd, flags, type);
+ /* If no relevant flag is set and no id_or_fd was passed, then
+ * tuple link/prog is just NULLed. This is the case when before/
+ * after selects first/last position without passing fd.
+ */
+ if (!id && !id_or_fd)
+ return 0;
+ return bpf_mprog_prog(tuple, id_or_fd, flags, type);
+}
+
+static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
+{
+ if (tuple->link)
+ bpf_link_put(tuple->link);
+ else if (tuple->prog)
+ bpf_prog_put(tuple->prog);
+}
+
+/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
+ * one exception that for deletion we support delete from front/back. In
+ * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
+ * Adjustment to first and last entry is trivial. The bpf_mprog_insert()
+ * we have to deal with the following cases:
+ *
+ * idx + before:
+ *
+ * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
+ * hence we adjust target idx for the new array, so that memmove copies
+ * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
+ * before P1 would have old idx -1 and new idx 0.
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ *
+ * idx + after:
+ *
+ * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
+ * Again, memmove copies P1 and P2 to the new entry, and we insert P4
+ * into idx 2. Inserting after P3 would have both old/new idx at 4 aka
+ * bpf_mprog_total(entry).
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ */
+static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *oprog;
+
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ oprog = READ_ONCE(fp->prog);
+ bpf_mprog_write(fp, cp, ntuple);
+ if (!ntuple->link) {
+ WARN_ON_ONCE(cp->link);
+ bpf_prog_put(oprog);
+ }
+ *entry_new = entry;
+ return 0;
+}
+
+static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx, u32 flags)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == total)
+ goto insert;
+ else if (flags & BPF_F_BEFORE)
+ idx += 1;
+ bpf_mprog_entry_grow(peer, idx);
+insert:
+ bpf_mprog_read(peer, idx, &fp, &cp);
+ bpf_mprog_write(fp, cp, ntuple);
+ bpf_mprog_inc(peer);
+ *entry_new = peer;
+ return 0;
+}
+
+static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *dtuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_entry_shrink(peer, idx);
+ bpf_mprog_dec(peer);
+ bpf_mprog_mark_for_release(peer, dtuple);
+ *entry_new = peer;
+ return 0;
+}
+
+/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
+ * program/link that needs to be replaced, inserted or deleted for
+ * each "rule" independently. If all rules agree on that position
+ * or existing element, then enact replacement, addition or deletion.
+ * If this is not the case, then the request cannot be satisfied and
+ * we bail out with an error.
+ */
+static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog))
+ return tuple->link == cp->link ? i : -EBUSY;
+ }
+ return -ENOENT;
+}
+
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i - 1;
+ }
+ return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i + 1;
+ }
+ return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, ntuple = {
+ .prog = prog_new,
+ .link = link,
+ }, otuple = {
+ .prog = prog_old,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (bpf_mprog_exists(entry, prog_new))
+ return -EEXIST;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
+ flags & ~BPF_F_REPLACE,
+ prog_new->type);
+ if (ret)
+ return ret;
+ if (flags & BPF_F_REPLACE) {
+ tidx = bpf_mprog_pos_exact(entry, &otuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ } else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_REPLACE)
+ ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
+ else
+ ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_cp *cp;
+ struct bpf_mprog_fp *fp;
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ link = cp->link;
+ /* The deletion request can either be without filled tuple in which
+ * case it gets populated here based on idx, or with filled tuple
+ * where the only thing we end up doing is the WARN_ON_ONCE() assert.
+ * If we hit a BPF link at the given index, it must not be removed
+ * from opts path.
+ */
+ if (link && !tuple->link)
+ return -EBUSY;
+ WARN_ON_ONCE(tuple->prog && tuple->prog != prog);
+ WARN_ON_ONCE(tuple->link && tuple->link != link);
+ tuple->prog = prog;
+ tuple->link = link;
+ return 0;
+}
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, dtuple = {
+ .prog = prog,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (flags & BPF_F_REPLACE)
+ return -EINVAL;
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (!bpf_mprog_total(entry))
+ return -ENOENT;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
+ prog ? prog->type :
+ BPF_PROG_TYPE_UNSPEC);
+ if (ret)
+ return ret;
+ if (dtuple.prog) {
+ tidx = bpf_mprog_pos_exact(entry, &dtuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ ret = bpf_mprog_fetch(entry, &dtuple, idx);
+ if (ret)
+ goto out;
+ ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry)
+{
+ u32 __user *uprog_flags, *ulink_flags;
+ u32 __user *uprog_id, *ulink_id;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *prog;
+ const u32 flags = 0;
+ u32 id, count = 0;
+ u64 revision = 1;
+ int i, ret = 0;
+
+ if (attr->query.query_flags || attr->query.attach_flags)
+ return -EINVAL;
+ if (entry) {
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ }
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.count, &count, sizeof(count)))
+ return -EFAULT;
+ uprog_id = u64_to_user_ptr(attr->query.prog_ids);
+ uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ ulink_id = u64_to_user_ptr(attr->query.link_ids);
+ ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);
+ if (attr->query.count == 0 || !uprog_id || !count)
+ return 0;
+ if (attr->query.count < count) {
+ count = attr->query.count;
+ ret = -ENOSPC;
+ }
+ for (i = 0; i < bpf_mprog_max(); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ if (!prog)
+ break;
+ id = prog->aux->id;
+ if (copy_to_user(uprog_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (uprog_flags &&
+ copy_to_user(uprog_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ id = cp->link ? cp->link->id : 0;
+ if (ulink_id &&
+ copy_to_user(ulink_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (ulink_flags &&
+ copy_to_user(ulink_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (i + 1 == count)
+ break;
+ }
+ return ret;
+}
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 310241ca7991..868cc2c43899 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
@@ -25,6 +26,32 @@ struct bpf_netns_link {
/* Protects updates to netns_bpf */
DEFINE_MUTEX(netns_bpf_mutex);
+static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_dec(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_inc(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
/* Must be called with netns_bpf_mutex held. */
static void netns_bpf_run_array_detach(struct net *net,
enum netns_bpf_attach_type type)
@@ -36,12 +63,50 @@ static void netns_bpf_run_array_detach(struct net *net,
bpf_prog_array_free(run_array);
}
+static int link_index(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_netns_link *link)
+{
+ struct bpf_netns_link *pos;
+ int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ if (pos == link)
+ return i;
+ i++;
+ }
+ return -ENOENT;
+}
+
+static int link_count(struct net *net, enum netns_bpf_attach_type type)
+{
+ struct list_head *pos;
+ int i = 0;
+
+ list_for_each(pos, &net->bpf.links[type])
+ i++;
+ return i;
+}
+
+static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_prog_array *prog_array)
+{
+ struct bpf_netns_link *pos;
+ unsigned int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ prog_array->items[i].prog = pos->link.prog;
+ i++;
+ }
+}
+
static void bpf_netns_link_release(struct bpf_link *link)
{
struct bpf_netns_link *net_link =
container_of(link, struct bpf_netns_link, link);
enum netns_bpf_attach_type type = net_link->netns_type;
+ struct bpf_prog_array *old_array, *new_array;
struct net *net;
+ int cnt, idx;
mutex_lock(&netns_bpf_mutex);
@@ -53,13 +118,41 @@ static void bpf_netns_link_release(struct bpf_link *link)
if (!net)
goto out_unlock;
- netns_bpf_run_array_detach(net, type);
+ /* Mark attach point as unused */
+ netns_bpf_attach_type_unneed(type);
+
+ /* Remember link position in case of safe delete */
+ idx = link_index(net, type, net_link);
list_del(&net_link->node);
+ cnt = link_count(net, type);
+ if (!cnt) {
+ netns_bpf_run_array_detach(net, type);
+ goto out_unlock;
+ }
+
+ old_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!new_array) {
+ WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx));
+ goto out_unlock;
+ }
+ fill_prog_array(net, type, new_array);
+ rcu_assign_pointer(net->bpf.run_array[type], new_array);
+ bpf_prog_array_free(old_array);
+
out_unlock:
+ net_link->net = NULL;
mutex_unlock(&netns_bpf_mutex);
}
+static int bpf_netns_link_detach(struct bpf_link *link)
+{
+ bpf_netns_link_release(link);
+ return 0;
+}
+
static void bpf_netns_link_dealloc(struct bpf_link *link)
{
struct bpf_netns_link *net_link =
@@ -77,7 +170,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
enum netns_bpf_attach_type type = net_link->netns_type;
struct bpf_prog_array *run_array;
struct net *net;
- int ret = 0;
+ int idx, ret;
if (old_prog && old_prog != link->prog)
return -EPERM;
@@ -95,7 +188,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
run_array = rcu_dereference_protected(net->bpf.run_array[type],
lockdep_is_held(&netns_bpf_mutex));
- WRITE_ONCE(run_array->items[0].prog, new_prog);
+ idx = link_index(net, type, net_link);
+ ret = bpf_prog_array_update_at(run_array, idx, new_prog);
+ if (ret)
+ goto out_unlock;
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
@@ -140,6 +236,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
static const struct bpf_link_ops bpf_netns_link_ops = {
.release = bpf_netns_link_release,
.dealloc = bpf_netns_link_dealloc,
+ .detach = bpf_netns_link_detach,
.update_prog = bpf_netns_link_update_prog,
.fill_link_info = bpf_netns_link_fill_info,
.show_fdinfo = bpf_netns_link_show_fdinfo,
@@ -309,18 +406,30 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
return ret;
}
+static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ return 1;
+ case NETNS_BPF_SK_LOOKUP:
+ return 64;
+ default:
+ return 0;
+ }
+}
+
static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
enum netns_bpf_attach_type type)
{
struct bpf_netns_link *net_link =
container_of(link, struct bpf_netns_link, link);
struct bpf_prog_array *run_array;
- int err;
+ int cnt, err;
mutex_lock(&netns_bpf_mutex);
- /* Allow attaching only one prog or link for now */
- if (!list_empty(&net->bpf.links[type])) {
+ cnt = link_count(net, type);
+ if (cnt >= netns_bpf_max_progs(type)) {
err = -E2BIG;
goto out_unlock;
}
@@ -334,6 +443,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
case NETNS_BPF_FLOW_DISSECTOR:
err = flow_dissector_bpf_prog_attach_check(net, link->prog);
break;
+ case NETNS_BPF_SK_LOOKUP:
+ err = 0; /* nothing to check */
+ break;
default:
err = -EINVAL;
break;
@@ -341,16 +453,22 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
if (err)
goto out_unlock;
- run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
+ run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL);
if (!run_array) {
err = -ENOMEM;
goto out_unlock;
}
- run_array->items[0].prog = link->prog;
- rcu_assign_pointer(net->bpf.run_array[type], run_array);
list_add_tail(&net_link->node, &net->bpf.links[type]);
+ fill_prog_array(net, type, run_array);
+ run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array,
+ lockdep_is_held(&netns_bpf_mutex));
+ bpf_prog_array_free(run_array);
+
+ /* Mark attach point as used */
+ netns_bpf_attach_type_need(type);
+
out_unlock:
mutex_unlock(&netns_bpf_mutex);
return err;
@@ -426,8 +544,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
mutex_lock(&netns_bpf_mutex);
for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
netns_bpf_run_array_detach(net, type);
- list_for_each_entry(net_link, &net->bpf.links[type], node)
+ list_for_each_entry(net_link, &net->bpf.links[type], node) {
net_link->net = NULL; /* auto-detach link */
+ netns_bpf_attach_type_unneed(type);
+ }
if (net->bpf.progs[type])
bpf_prog_put(net->bpf.progs[type]);
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..1a4fec330eaa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,7 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
* of all progs.
@@ -41,7 +42,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +57,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +72,227 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
- if (!ondev) {
+ /* When program is offloaded require presence of "true"
+ * bpf_offload_netdev, avoid the one created for !ondev case below.
+ */
+ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
err = -EINVAL;
- goto err_unlock;
+ goto err_free;
+ }
+ if (!ondev) {
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+ return -EINVAL;
+
+ /* Frags are allowed only if program is dev-bound-only, but not
+ * if it is requesting bpf offload.
+ */
+ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+ !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,27 +378,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -343,22 +510,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -372,7 +523,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
@@ -404,19 +555,10 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
err_unlock:
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -428,7 +570,13 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
+}
+
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
@@ -576,12 +724,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -595,32 +759,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
- up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -628,43 +771,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -673,18 +781,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -710,3 +806,68 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
+ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+ XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+core_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b367430e611c..034cf87b54e9 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
raw_spin_lock_init(&head->lock);
head->first = NULL;
}
+ raw_spin_lock_init(&s->extralist.lock);
+ s->extralist.first = NULL;
return 0;
}
@@ -29,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
- head->first = node;
+ WRITE_ONCE(head->first, node);
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -40,12 +42,48 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ if (!raw_spin_trylock(&s->extralist.lock))
+ return false;
+
+ pcpu_freelist_push_node(&s->extralist, node);
+ raw_spin_unlock(&s->extralist.lock);
+ return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ int cpu, orig_cpu;
+
+ orig_cpu = raw_smp_processor_id();
+ while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+ struct pcpu_freelist_head *head;
+
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
+ }
+
+ /* cannot lock any per cpu lock, try extralist */
+ if (pcpu_freelist_try_push_extra(s, node))
+ return;
+ }
+}
+
void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
- struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
-
- ___pcpu_freelist_push(head, node);
+ if (in_nmi())
+ ___pcpu_freelist_push_nmi(s, node);
+ else
+ ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
}
void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -62,48 +100,92 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
- int i, cpu, pcpu_entries;
+ unsigned int cpu, cpu_idx, i, j, n, m;
- pcpu_entries = nr_elems / num_possible_cpus() + 1;
- i = 0;
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+ cpu_idx = 0;
for_each_possible_cpu(cpu) {
-again:
head = per_cpu_ptr(s->freelist, cpu);
- /* No locking required as this is not visible yet. */
- pcpu_freelist_push_node(head, buf);
- i++;
- buf += elem_size;
- if (i == nr_elems)
- break;
- if (i % pcpu_entries)
- goto again;
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
}
}
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- return NULL;
}
+
+ /* per cpu lists are all empty, try extralist */
+ if (!READ_ONCE(s->extralist.first))
+ return NULL;
+ raw_spin_lock(&s->extralist.lock);
+ node = s->extralist.first;
+ if (node)
+ WRITE_ONCE(s->extralist.first, node->next);
+ raw_spin_unlock(&s->extralist.lock);
+ return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_head *head;
+ struct pcpu_freelist_node *node;
+ int cpu;
+
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
+ if (raw_spin_trylock(&head->lock)) {
+ node = head->first;
+ if (node) {
+ WRITE_ONCE(head->first, node->next);
+ raw_spin_unlock(&head->lock);
+ return node;
+ }
+ raw_spin_unlock(&head->lock);
+ }
+ }
+
+ /* cannot pop from per cpu lists, try extralist */
+ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
+ return NULL;
+ node = s->extralist.first;
+ if (node)
+ WRITE_ONCE(s->extralist.first, node->next);
+ raw_spin_unlock(&s->extralist.lock);
+ return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ if (in_nmi())
+ return ___pcpu_freelist_pop_nmi(s);
+ return ___pcpu_freelist_pop(s);
}
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index fbf8a8a28979..3c76553cfe57 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
+ struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {
diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore
new file mode 100644
index 000000000000..9452322902a5
--- /dev/null
+++ b/kernel/bpf/preload/.gitignore
@@ -0,0 +1,2 @@
+/libbpf
+/bpf_preload_umd
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
new file mode 100644
index 000000000000..c9d45c9d6918
--- /dev/null
+++ b/kernel/bpf/preload/Kconfig
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config USERMODE_DRIVER
+ bool
+ default n
+
+menuconfig BPF_PRELOAD
+ bool "Preload BPF file system with kernel specific program and map iterators"
+ depends on BPF
+ depends on BPF_SYSCALL
+ # The dependency on !COMPILE_TEST prevents it from being enabled
+ # in allmodconfig or allyesconfig configurations
+ depends on !COMPILE_TEST
+ select USERMODE_DRIVER
+ help
+ This builds kernel module with several embedded BPF programs that are
+ pinned into BPF FS mount point as human readable files that are
+ useful in debugging and introspection of BPF programs and maps.
+
+if BPF_PRELOAD
+config BPF_PRELOAD_UMD
+ tristate "bpf_preload kernel module"
+ default m
+ help
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
+endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
new file mode 100644
index 000000000000..20f89cc0a0a6
--- /dev/null
+++ b/kernel/bpf/preload/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+LIBBPF_INCLUDE = $(srctree)/tools/lib
+
+obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
new file mode 100644
index 000000000000..f065c91213a0
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_PRELOAD_H
+#define _BPF_PRELOAD_H
+
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
+
+struct bpf_preload_ops {
+ int (*preload)(struct bpf_preload_info *);
+ struct module *owner;
+};
+extern struct bpf_preload_ops *bpf_preload_ops;
+#define BPF_PRELOAD_LINKS 2
+#endif
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
new file mode 100644
index 000000000000..0c63bc2cd895
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include "bpf_preload.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
+
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
+
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
+
+static struct bpf_preload_ops ops = {
+ .preload = preload,
+ .owner = THIS_MODULE,
+};
+
+static int load_skel(void)
+{
+ int err;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
+ if (err)
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
+ }
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
+ }
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
+ return 0;
+out:
+ free_links_and_skel();
+ return err;
+}
+
+static int __init load(void)
+{
+ int err;
+
+ err = load_skel();
+ if (err)
+ return err;
+ bpf_preload_ops = &ops;
+ return err;
+}
+
+static void __exit fini(void)
+{
+ bpf_preload_ops = NULL;
+ free_links_and_skel();
+}
+late_initcall(load);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore
new file mode 100644
index 000000000000..ffdb70230c8b
--- /dev/null
+++ b/kernel/bpf/preload/iterators/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/.output
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
new file mode 100644
index 000000000000..b83c2f5e9be1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0
+OUTPUT := .output
+abs_out := $(abspath $(OUTPUT))
+
+CLANG ?= clang
+LLC ?= llc
+LLVM_STRIP ?= llvm-strip
+
+TOOLS_PATH := $(abspath ../../../../tools)
+BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abs_out)/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT := $(abs_out)/libbpf
+LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
+BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
+
+INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
+CFLAGS := -g -Wall
+
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))";
+MAKEFLAGS += --no-print-directory
+submake_extras := feature_display=0
+endif
+
+.DELETE_ON_ERROR:
+
+.PHONY: all clean
+
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
+
+clean:
+ $(call msg,CLEAN)
+ $(Q)rm -rf $(OUTPUT) iterators
+
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
+ $(call msg,GEN-SKEL,$@)
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
+
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+ $(call msg,BPF,$@)
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \
+ -c $(filter %.c,$^) -o $@ && \
+ $(LLVM_STRIP) -g $@
+
+$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $@
+
+$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
+ OUTPUT=$(abspath $(dir $@))/ prefix= \
+ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
+
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
new file mode 100644
index 000000000000..98e7c90ea012
--- /dev/null
+++ b/kernel/bpf/preload/iterators/README
@@ -0,0 +1,7 @@
+WARNING:
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
+Make sure to have clang 10 installed.
+See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
new file mode 100644
index 000000000000..b78968b63fab
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
+struct seq_file;
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+};
+
+struct bpf_map {
+ __u32 id;
+ char name[16];
+ __u32 max_entries;
+};
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+};
+
+struct btf_type {
+ __u32 name_off;
+};
+
+struct btf_header {
+ __u32 str_len;
+};
+
+struct btf {
+ const char *strings;
+ struct btf_type **types;
+ struct btf_header hdr;
+};
+
+struct bpf_prog_aux {
+ __u32 id;
+ char name[16];
+ const char *attach_func_name;
+ struct bpf_prog *dst_prog;
+ struct bpf_func_info *func_info;
+ struct btf *btf;
+};
+
+struct bpf_prog {
+ struct bpf_prog_aux *aux;
+};
+
+struct bpf_iter__bpf_prog {
+ struct bpf_iter_meta *meta;
+ struct bpf_prog *prog;
+};
+#pragma clang attribute pop
+
+static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
+{
+ struct btf_type **types, *t;
+ unsigned int name_off;
+ const char *str;
+
+ if (!btf)
+ return fallback;
+ str = btf->strings;
+ types = btf->types;
+ bpf_probe_read_kernel(&t, sizeof(t), types + btf_id);
+ name_off = BPF_CORE_READ(t, name_off);
+ if (name_off >= btf->hdr.str_len)
+ return fallback;
+ return str + name_off;
+}
+
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_map *map = ctx->map;
+
+ if (!map)
+ return 0;
+
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n",
+ map->id, map->name, map->max_entries,
+ bpf_map_sum_elem_count(map));
+
+ return 0;
+}
+
+SEC("iter/bpf_prog")
+int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_prog *prog = ctx->prog;
+ struct bpf_prog_aux *aux;
+
+ if (!prog)
+ return 0;
+
+ aux = prog->aux;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name attached\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id,
+ get_name(aux->btf, aux->func_info[0].type_id, aux->name),
+ aux->attach_func_name, aux->dst_prog->aux->name);
+ return 0;
+}
+char LICENSE[] SEC("license") = "GPL";
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..ebdc6c0cdb70
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6008;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\
+\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\
+\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\
+\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\
+\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\
+\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\
+\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\
+\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\
+\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\
+\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\
+\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\
+\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\
+\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\
+\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\
+\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\
+\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\
+\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\
+\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\
+\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\
+\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\
+\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\
+\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\
+\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\
+\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\
+\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\
+\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\
+\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\
+\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\
+\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\
+\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\
+\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\
+\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\
+\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\
+\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\
+\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\
+\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\
+\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\
+\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\
+\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\
+\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\
+\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\
+\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\
+\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\
+\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\
+\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\
+\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\
+\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\
+\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\
+\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\
+\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\
+\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\
+\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\
+\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\
+\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\
+\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\
+\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\
+\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\
+\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\
+\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\
+\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\
+\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\
+\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\
+\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\
+\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\
+\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\
+\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\
+\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\
+\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\
+\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\
+\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\
+\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\
+\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\
+\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\
+\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\
+\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\
+\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\
+\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\
+\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\
+\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\
+\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\
+\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\
+\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\
+\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\
+\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\
+\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\
+\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\
+\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\
+\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\
+\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\
+\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\
+\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\
+\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\
+\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\
+\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\
+\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\
+\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\
+\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\
+\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\
+\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\
+\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\
+\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\
+\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\
+\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\
+\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\
+\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\
+\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\
+\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\
+\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\
+\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\
+\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\
+\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\
+\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\
+\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\
+\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\
+\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\
+\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\
+\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\
+\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\
+\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\
+\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\
+\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\
+\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\
+\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\
+\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\
+\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\
+\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\
+\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\
+\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
new file mode 100644
index 000000000000..5b98ab02025e
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6208;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\
+\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\
+\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\
+\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\
+\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\
+\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\
+\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\
+\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\
+\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\
+\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\
+\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\
+\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\
+\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\
+\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\
+\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\
+\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\
+\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\
+\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\
+\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\
+\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\
+\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\
+\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\
+\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\
+\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\
+\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\
+\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\
+\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\
+\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\
+\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\
+\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\
+\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\
+\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\
+\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\
+\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\
+\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\
+\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\
+\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\
+\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\
+\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\
+\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\
+\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\
+\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\
+\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\
+\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\
+\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\
+\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\
+\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\
+\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\
+\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\
+\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\
+\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\
+\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\
+\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\
+\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\
+\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\
+\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\
+\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\
+\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\
+\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\
+\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\
+\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\
+\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\
+\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\
+\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\
+\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\
+\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\
+\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\
+\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\
+\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\
+\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\
+\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\
+\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\
+\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\
+\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\
+\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\
+\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\
+\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\
+\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\
+\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\
+\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\
+\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\
+\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\
+\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\
+\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\
+\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\
+\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\
+\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\
+\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\
+\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\
+\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\
+\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\
+\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\
+\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\
+\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\
+\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\
+\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\
+\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\
+\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\
+\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\
+\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\
+\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\
+\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\
+\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\
+\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\
+\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\
+\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\
+\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\
+\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\
+\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\
+\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2456;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\
+\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\
+\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\
+\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\
+\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\
+\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\
+\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\
+\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\
+\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\
+\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\
+\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\
+\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\
+\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\
+\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\
+\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\
+\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\
+\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\
+\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\
+\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\
+\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\
+\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\
+\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\
+\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
new file mode 100644
index 000000000000..53a73c841c13
--- /dev/null
+++ b/kernel/bpf/prog_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_prog_info {
+ u32 prog_id;
+};
+
+static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_curr_or_next(&info->prog_id);
+ if (!prog)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return prog;
+}
+
+static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+
+ ++*pos;
+ ++info->prog_id;
+ bpf_prog_put((struct bpf_prog *)v);
+ return bpf_prog_get_curr_or_next(&info->prog_id);
+}
+
+struct bpf_iter__bpf_prog {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_prog *, prog);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog)
+
+static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_prog ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.prog = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_prog_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_prog_seq_show(seq, v, false);
+}
+
+static void bpf_prog_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_prog_seq_show(seq, v, true);
+ else
+ bpf_prog_put((struct bpf_prog *)v);
+}
+
+static const struct seq_operations bpf_prog_seq_ops = {
+ .start = bpf_prog_seq_start,
+ .next = bpf_prog_seq_next,
+ .stop = bpf_prog_seq_stop,
+ .show = bpf_prog_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_prog_id)
+BTF_ID(struct, bpf_prog)
+
+static const struct bpf_iter_seq_info bpf_prog_seq_info = {
+ .seq_ops = &bpf_prog_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info),
+};
+
+static struct bpf_iter_reg bpf_prog_reg_info = {
+ .target = "bpf_prog",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_prog, prog),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_prog_seq_info,
+};
+
+static int __init bpf_prog_iter_init(void)
+{
+ bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id;
+ return bpf_iter_reg_target(&bpf_prog_reg_info);
+}
+
+late_initcall(bpf_prog_iter_init);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 05c8e043b9d2..d869f51ea93a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,7 +7,7 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/capability.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -45,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!bpf_capable())
- return -EPERM;
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
@@ -66,29 +63,19 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
- int ret, numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_map_memory mem = {0};
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_queue_stack *qs;
- u64 size, queue_size, cost;
+ u64 size, queue_size;
size = (u64) attr->max_entries + 1;
- cost = queue_size = sizeof(*qs) + size * attr->value_size;
-
- ret = bpf_map_charge_init(&mem, cost);
- if (ret < 0)
- return ERR_PTR(ret);
+ queue_size = sizeof(*qs) + size * attr->value_size;
qs = bpf_map_area_alloc(queue_size, numa_node);
- if (!qs) {
- bpf_map_charge_finish(&mem);
+ if (!qs)
return ERR_PTR(-ENOMEM);
- }
-
- memset(qs, 0, sizeof(*qs));
bpf_map_init_from_attr(&qs->map, attr);
- bpf_map_charge_move(&qs->map.memory, &mem);
qs->size = size;
raw_spin_lock_init(&qs->lock);
@@ -101,24 +88,22 @@ static void queue_stack_map_free(struct bpf_map *map)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
- */
- synchronize_rcu();
-
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
int err = 0;
void *ptr;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -140,7 +125,7 @@ out:
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -148,7 +133,12 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -172,32 +162,32 @@ out:
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -213,7 +203,12 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ }
if (queue_stack_map_is_full(qs)) {
if (!replace) {
@@ -243,14 +238,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -262,7 +257,17 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
.map_free = queue_stack_map_free,
@@ -273,9 +278,12 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
const struct bpf_map_ops stack_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
.map_free = queue_stack_map_free,
@@ -286,4 +294,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index cae9d505e04a..4b4f9670f1a9 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -6,6 +6,7 @@
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
struct reuseport_array {
struct bpf_map map;
@@ -20,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
@@ -61,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
@@ -99,12 +97,10 @@ static void reuseport_array_free(struct bpf_map *map)
struct sock *sk;
u32 i;
- synchronize_rcu();
-
/*
* ops->map_*_elem() will not be able to access this
* array now. Hence, this function only races with
- * bpf_sk_reuseport_detach() which was triggerred by
+ * bpf_sk_reuseport_detach() which was triggered by
* close() or disconnect().
*
* This function and bpf_sk_reuseport_detach() are
@@ -145,38 +141,23 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
- int err, numa_node = bpf_map_attr_numa_node(attr);
+ int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- struct bpf_map_memory mem;
- u64 array_size;
-
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
- array_size = sizeof(*array);
- array_size += (u64)attr->max_entries * sizeof(struct sock *);
-
- err = bpf_map_charge_init(&mem, array_size);
- if (err)
- return ERR_PTR(err);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
- if (!array) {
- bpf_map_charge_finish(&mem);
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
+ if (!array)
return ERR_PTR(-ENOMEM);
- }
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- bpf_map_charge_move(&array->map.memory, &mem);
return &array->map;
}
@@ -193,7 +174,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
rcu_read_lock();
sk = reuseport_array_lookup_elem(map, key);
if (sk) {
- *(u64 *)value = sock_gen_cookie(sk);
+ *(u64 *)value = __sock_gen_cookie(sk);
err = 0;
} else {
err = -ENOENT;
@@ -351,11 +332,22 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = reuseport_array_alloc_check,
.map_alloc = reuseport_array_alloc,
.map_free = reuseport_array_free,
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
+ .map_mem_usage = reuseport_array_mem_usage,
+ .map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0af88bbc1c15..0ee653a936ea 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -8,7 +8,9 @@
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
+#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -17,18 +19,10 @@
(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
-/* Maximum size of ring buffer area is limited by 32-bit page offset within
- * record header, counted in pages. Reserve 8 bits for extensibility, and take
- * into account few extra pages for consumer/producer pages and
- * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
- * ring buffer.
- */
-#define RINGBUF_MAX_DATA_SZ \
- (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
-
struct bpf_ringbuf {
wait_queue_head_t waitq;
struct irq_work work;
@@ -36,10 +30,43 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -48,7 +75,6 @@ struct bpf_ringbuf {
struct bpf_ringbuf_map {
struct bpf_map map;
- struct bpf_map_memory memory;
struct bpf_ringbuf *rb;
};
@@ -60,9 +86,9 @@ struct bpf_ringbuf_hdr {
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
- const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
- __GFP_ZERO;
- int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
+ __GFP_NOWARN | __GFP_ZERO;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
struct page **pages, *page;
@@ -88,10 +114,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
* user-space implementations significantly.
*/
array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
- if (array_size > PAGE_SIZE)
- pages = vmalloc_node(array_size, numa_node);
- else
- pages = kmalloc_node(array_size, flags, numa_node);
+ pages = bpf_map_area_alloc(array_size, numa_node);
if (!pages)
return NULL;
@@ -107,8 +130,9 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
}
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
- VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
if (rb) {
+ kmemleak_not_leak(pages);
rb->pages = pages;
rb->nr_pages = nr_pages;
return rb;
@@ -117,7 +141,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -128,15 +152,27 @@ static void bpf_ringbuf_notify(struct irq_work *work)
wake_up_all(&rb->waitq);
}
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and
+ * take into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts, the current maximum size would be:
+ *
+ * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+ *
+ * This gives 64GB limit, which seems plenty for single ring buffer. Now
+ * considering that the maximum value of data_sz is (4GB - 1), there
+ * will be no overflow, so just note the size limit in the comments.
+ */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
struct bpf_ringbuf *rb;
rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
if (!rb)
- return ERR_PTR(-ENOMEM);
+ return NULL;
spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -150,8 +186,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
struct bpf_ringbuf_map *rb_map;
- u64 cost;
- int err;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -161,38 +195,19 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
!PAGE_ALIGNED(attr->max_entries))
return ERR_PTR(-EINVAL);
-#ifdef CONFIG_64BIT
- /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
- if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
- return ERR_PTR(-E2BIG);
-#endif
-
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&rb_map->map, attr);
- cost = sizeof(struct bpf_ringbuf_map) +
- sizeof(struct bpf_ringbuf) +
- attr->max_entries;
- err = bpf_map_charge_init(&rb_map->map.memory, cost);
- if (err)
- goto err_free_map;
-
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
- if (IS_ERR(rb_map->rb)) {
- err = PTR_ERR(rb_map->rb);
- goto err_uncharge;
+ if (!rb_map->rb) {
+ bpf_map_area_free(rb_map);
+ return ERR_PTR(-ENOMEM);
}
return &rb_map->map;
-
-err_uncharge:
- bpf_map_charge_finish(&rb_map->map.memory);
-err_free_map:
- kfree(rb_map);
- return ERR_PTR(err);
}
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
@@ -206,23 +221,16 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
{
struct bpf_ringbuf_map *rb_map;
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
- */
- synchronize_rcu();
-
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -230,13 +238,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
@@ -247,27 +255,42 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
- size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
- /* consumer page + producer page + 2 x data pages */
- return RINGBUF_POS_PAGES + 2 * data_pages;
+ if (vma->vm_flags & VM_WRITE) {
+ /* allow writable mapping for the consumer_pos only */
+ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EPERM;
+ } else {
+ vm_flags_clear(vma, VM_MAYWRITE);
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb,
+ vma->vm_pgoff + RINGBUF_PGOFF);
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
- size_t mmap_sz;
rb_map = container_of(map, struct bpf_ringbuf_map, map);
- mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
-
- if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
- return -EINVAL;
- return remap_vmalloc_range(vma, rb_map->rb,
- vma->vm_pgoff + RINGBUF_PGOFF);
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vm_flags_clear(vma, VM_MAYWRITE);
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
@@ -279,8 +302,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
return prod_pos - cons_pos;
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -292,15 +320,62 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &ringbuf_map_btf_ids[0],
+};
+
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
};
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
@@ -337,6 +412,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+ if (len > ringbuf_total_data_sz(rb))
+ return NULL;
+
cons_pos = smp_load_acquire(&rb->consumer_pos);
if (in_nmi()) {
@@ -383,7 +461,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -426,7 +504,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -439,7 +517,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -466,7 +544,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -481,7 +559,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
@@ -497,3 +575,215 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+ struct bpf_dynptr_kern *, ptr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *sample;
+ int err;
+
+ if (unlikely(flags)) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ err = bpf_dynptr_check_size(size);
+ if (err) {
+ bpf_dynptr_set_null(ptr);
+ return err;
+ }
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!sample) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+ .func = bpf_ringbuf_reserve_dynptr,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+ .func = bpf_ringbuf_submit_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+ .func = bpf_ringbuf_discard_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ atomic_set_release(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 599488f25e40..dff7ba539701 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -4,12 +4,13 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
+#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/elf.h>
-#include <linux/pagemap.h>
-#include <linux/irq_work.h>
+#include <linux/btf_ids.h>
+#include <linux/buildid.h>
#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@@ -27,28 +28,9 @@ struct bpf_stack_map {
void *elems;
struct pcpu_freelist freelist;
u32 n_buckets;
- struct stack_map_bucket *buckets[];
+ struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
- struct irq_work irq_work;
- struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
- struct stack_map_irq_work *work;
-
- if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
- return;
-
- work = container_of(entry, struct stack_map_irq_work, irq_work);
- mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -62,7 +44,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
@@ -88,13 +71,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_stack_map *smap;
- struct bpf_map_memory mem;
u64 cost, n_buckets;
int err;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -114,279 +93,138 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* hash table size must be power of 2 */
n_buckets = roundup_pow_of_two(attr->max_entries);
+ if (!n_buckets)
+ return ERR_PTR(-E2BIG);
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
- err = bpf_map_charge_init(&mem, cost);
- if (err)
- return ERR_PTR(err);
-
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
- if (!smap) {
- bpf_map_charge_finish(&mem);
+ if (!smap)
return ERR_PTR(-ENOMEM);
- }
bpf_map_init_from_attr(&smap->map, attr);
- smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
- goto free_charge;
+ goto free_smap;
err = prealloc_elems_and_freelist(smap);
if (err)
goto put_buffers;
- bpf_map_charge_move(&smap->map.memory, &mem);
-
return &smap->map;
put_buffers:
put_callchain_buffers();
-free_charge:
- bpf_map_charge_finish(&mem);
+free_smap:
bpf_map_area_free(smap);
return ERR_PTR(err);
}
-#define BPF_BUILD_ID 3
-/*
- * Parse build id from the note segment. This logic can be shared between
- * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
- * identical.
- */
-static inline int stack_map_parse_build_id(void *page_addr,
- unsigned char *build_id,
- void *note_start,
- Elf32_Word note_size)
-{
- Elf32_Word note_offs = 0, new_offs;
-
- /* check for overflow */
- if (note_start < page_addr || note_start + note_size < note_start)
- return -EINVAL;
-
- /* only supports note that fits in the first page */
- if (note_start + note_size > page_addr + PAGE_SIZE)
- return -EINVAL;
-
- while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
- Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
-
- if (nhdr->n_type == BPF_BUILD_ID &&
- nhdr->n_namesz == sizeof("GNU") &&
- nhdr->n_descsz > 0 &&
- nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
- memcpy(build_id,
- note_start + note_offs +
- ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
- nhdr->n_descsz);
- memset(build_id + nhdr->n_descsz, 0,
- BPF_BUILD_ID_SIZE - nhdr->n_descsz);
- return 0;
- }
- new_offs = note_offs + sizeof(Elf32_Nhdr) +
- ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
- if (new_offs <= note_offs) /* overflow */
- break;
- note_offs = new_offs;
- }
- return -EINVAL;
-}
-
-/* Parse build ID from 32-bit ELF */
-static int stack_map_get_build_id_32(void *page_addr,
- unsigned char *build_id)
-{
- Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
- Elf32_Phdr *phdr;
- int i;
-
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
- return -EINVAL;
-
- phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
-
- for (i = 0; i < ehdr->e_phnum; ++i)
- if (phdr[i].p_type == PT_NOTE)
- return stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz);
- return -EINVAL;
-}
-
-/* Parse build ID from 64-bit ELF */
-static int stack_map_get_build_id_64(void *page_addr,
- unsigned char *build_id)
-{
- Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
- Elf64_Phdr *phdr;
- int i;
-
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
- return -EINVAL;
-
- phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
-
- for (i = 0; i < ehdr->e_phnum; ++i)
- if (phdr[i].p_type == PT_NOTE)
- return stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz);
- return -EINVAL;
-}
-
-/* Parse build ID of ELF file mapped to vma */
-static int stack_map_get_build_id(struct vm_area_struct *vma,
- unsigned char *build_id)
-{
- Elf32_Ehdr *ehdr;
- struct page *page;
- void *page_addr;
- int ret;
-
- /* only works for page backed storage */
- if (!vma->vm_file)
- return -EINVAL;
-
- page = find_get_page(vma->vm_file->f_mapping, 0);
- if (!page)
- return -EFAULT; /* page not mapped */
-
- ret = -EINVAL;
- page_addr = kmap_atomic(page);
- ehdr = (Elf32_Ehdr *)page_addr;
-
- /* compare magic x7f "ELF" */
- if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
- goto out;
-
- /* only support executable file and shared object file */
- if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
- goto out;
-
- if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
- ret = stack_map_get_build_id_32(page_addr, build_id);
- else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
- ret = stack_map_get_build_id_64(page_addr, build_id);
-out:
- kunmap_atomic(page_addr);
- put_page(page);
- return ret;
-}
-
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
- struct vm_area_struct *vma;
- bool irq_work_busy = false;
- struct stack_map_irq_work *work = NULL;
-
- if (irqs_disabled()) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- work = this_cpu_ptr(&up_read_work);
- if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
- /* cannot queue more up_read, fallback */
- irq_work_busy = true;
- }
- } else {
- /*
- * PREEMPT_RT does not allow to trylock mmap sem in
- * interrupt disabled context. Force the fallback code.
- */
- irq_work_busy = true;
- }
- }
-
- /*
- * We cannot do up_read() when the irq is disabled, because of
- * risk to deadlock with rq_lock. To do build_id lookup when the
- * irqs are disabled, we need to run up_read() in irq_work. We use
- * a percpu variable to do the irq_work. If the irq_work is
- * already used by another lookup, we fall back to report ips.
- *
- * Same fallback is used for kernel stack (!user) on a stackmap
- * with build_id.
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
+
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock_non_owner(current->mm)) {
+ !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
- memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
+ if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id,
+ BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
vma = find_vma(current->mm, ips[i]);
- if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+ if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
- memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
+build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
+ bpf_mmap_unlock_mm(work, current->mm);
+}
- if (!work) {
- mmap_read_unlock_non_owner(current->mm);
- } else {
- work->mm = current->mm;
- irq_work_queue(&work->irq_work);
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
+{
+#ifdef CONFIG_STACKTRACE
+ struct perf_callchain_entry *entry;
+ int rctx;
+
+ entry = get_callchain_entry(&rctx);
+
+ if (!entry)
+ return NULL;
+
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
+
+ /* stack_trace_save_tsk() works on unsigned long array, while
+ * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+ * necessary to fix this mismatch.
+ */
+ if (__BITS_PER_LONG != 64) {
+ unsigned long *from = (unsigned long *) entry->ip;
+ u64 *to = entry->ip;
+ int i;
+
+ /* copy data from the end to avoid using extra buffer */
+ for (i = entry->nr - 1; i >= 0; i--)
+ to[i] = (u64)(from[i]);
}
+
+ put_callchain_entry(rctx);
+
+ return entry;
+#else /* CONFIG_STACKTRACE */
+ return NULL;
+#endif
}
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
- u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map,
+ struct perf_callchain_entry *trace, u64 flags)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
- struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
- bool kernel = !user;
u64 *ips;
bool hash_matches;
- if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
- BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
- return -EINVAL;
-
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
-
- if (unlikely(!trace))
- /* couldn't fetch the stack trace */
- return -EFAULT;
-
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -439,6 +277,33 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
return id;
}
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
+{
+ u32 max_depth = map->value_size / stack_map_data_size(map);
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ max_depth += skip;
+ if (max_depth > sysctl_perf_event_max_stack)
+ max_depth = sysctl_perf_event_max_stack;
+
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ false, false);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ return __bpf_get_stackid(map, trace, flags);
+}
+
const struct bpf_func_proto bpf_get_stackid_proto = {
.func = bpf_get_stackid,
.gpl_only = true,
@@ -448,11 +313,82 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
- u64, flags)
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ __u64 nr_kernel = 0;
+
+ while (nr_kernel < trace->nr) {
+ if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+ break;
+ nr_kernel++;
+ }
+ return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_map *, map, u64, flags)
+{
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ __u64 nr_kernel;
+ int ret;
+
+ /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return bpf_get_stackid((unsigned long)(ctx->regs),
+ (unsigned long) map, flags, 0, 0);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ return -EFAULT;
+
+ nr_kernel = count_kernel_ip(trace);
+
+ if (kernel) {
+ __u64 nr = trace->nr;
+
+ trace->nr = nr_kernel;
+ ret = __bpf_get_stackid(map, trace, flags);
+
+ /* restore nr */
+ trace->nr = nr;
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ return -EFAULT;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ ret = __bpf_get_stackid(map, trace, flags);
+ }
+ return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+ .func = bpf_get_stackid_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+ struct perf_callchain_entry *trace_in,
+ void *buf, u32 size, u64 flags)
+{
+ u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
@@ -471,24 +407,41 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
if (unlikely(size % elem_size))
goto clear;
+ /* cannot get valid user stack for task without user_mode regs */
+ if (task && user && !user_mode(regs))
+ goto err_fault;
+
+ /* get_perf_callchain does not support crosstask user stack walking
+ * but returns an empty stack instead of NULL.
+ */
+ if (crosstask && user) {
+ err = -EOPNOTSUPP;
+ goto clear;
+ }
+
num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
+ max_depth = num_elem + skip;
+ if (sysctl_perf_event_max_stack < max_depth)
+ max_depth = sysctl_perf_event_max_stack;
+
+ if (trace_in)
+ trace = trace_in;
+ else if (kernel && task)
+ trace = get_callchain_entry_for_task(task, max_depth);
else
- init_nr = sysctl_perf_event_max_stack - num_elem;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ crosstask, false);
if (unlikely(!trace))
goto err_fault;
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (trace->nr < skip)
goto err_fault;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
+
+ ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
@@ -505,6 +458,12 @@ clear:
return err;
}
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+}
+
const struct bpf_func_proto bpf_get_stack_proto = {
.func = bpf_get_stack,
.gpl_only = true,
@@ -515,6 +474,97 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ struct pt_regs *regs;
+ long res = -EINVAL;
+
+ if (!try_get_task_stack(task))
+ return -EFAULT;
+
+ regs = task_pt_regs(task);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ put_task_stack(task);
+
+ return res;
+}
+
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+ .func = bpf_get_task_stack,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ int err = -EINVAL;
+ __u64 nr_kernel;
+
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ err = -EFAULT;
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ goto clear;
+
+ nr_kernel = count_kernel_ip(trace);
+
+ if (kernel) {
+ __u64 nr = trace->nr;
+
+ trace->nr = nr_kernel;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+
+ /* restore nr */
+ trace->nr = nr;
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ goto clear;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ }
+ return err;
+
+clear:
+ memset(buf, 0, size);
+ return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+ .func = bpf_get_stack_pe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -574,14 +624,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
@@ -604,16 +654,28 @@ static void stack_map_free(struct bpf_map *map)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
- /* wait for bpf programs to complete before freeing stack map */
- synchronize_rcu();
-
bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
bpf_map_area_free(smap);
put_callchain_buffers();
}
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
@@ -621,17 +683,6 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = stack_map_mem_usage,
+ .map_btf_id = &stack_trace_map_btf_ids[0],
};
-
-static int __init stack_map_init(void)
-{
- int cpu;
- struct stack_map_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&up_read_work, cpu);
- init_irq_work(&work->irq_work, do_up_read);
- }
- return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fd80ac81f70..a1f18681721c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2,8 +2,11 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
@@ -16,7 +19,6 @@
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
-#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
@@ -28,7 +30,15 @@
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
+#include <linux/sort.h>
#include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
+#include <net/tcx.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -48,7 +58,8 @@ static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);
-int sysctl_unprivileged_bpf_disabled __read_mostly;
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
@@ -70,11 +81,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
-int bpf_check_uarg_tail_zero(void __user *uaddr,
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
size_t expected_size,
size_t actual_size)
{
- unsigned char __user *addr = uaddr + expected_size;
int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
@@ -83,48 +93,41 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
if (actual_size <= expected_size)
return 0;
- res = check_zeroed_user(addr, actual_size - expected_size);
+ if (uaddr.is_kernel)
+ res = memchr_inv(uaddr.kernel + expected_size, 0,
+ actual_size - expected_size) == NULL;
+ else
+ res = check_zeroed_user(uaddr.user + expected_size,
+ actual_size - expected_size);
if (res < 0)
return res;
return res ? 0 : -E2BIG;
}
const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
};
-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+static void bpf_map_write_active_inc(struct bpf_map *map)
{
- const struct bpf_map_ops *ops;
- u32 type = attr->map_type;
- struct bpf_map *map;
- int err;
+ atomic64_inc(&map->writecnt);
+}
- if (type >= ARRAY_SIZE(bpf_map_types))
- return ERR_PTR(-EINVAL);
- type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
- ops = bpf_map_types[type];
- if (!ops)
- return ERR_PTR(-EINVAL);
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+ atomic64_dec(&map->writecnt);
+}
- if (ops->map_alloc_check) {
- err = ops->map_alloc_check(attr);
- if (err)
- return ERR_PTR(err);
- }
- if (attr->map_ifindex)
- ops = &bpf_map_offload_ops;
- map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = ops;
- map->map_type = type;
- return map;
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+ return atomic64_read(&map->writecnt) != 0;
}
-static u32 bpf_map_value_size(struct bpf_map *map)
+static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -139,30 +142,35 @@ static u32 bpf_map_value_size(struct bpf_map *map)
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
+ /* Wait for any running non-sleepable BPF programs to complete so that
+ * userspace, when we return to it, knows that all non-sleepable
+ * programs that could be running use the new map value. For sleepable
+ * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+ * for the completions of these programs, but considering the waiting
+ * time can be very long and userspace may think it will hang forever,
+ * so don't handle sleepable BPF programs now.
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
synchronize_rcu();
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -176,21 +184,18 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
} else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
@@ -198,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
rcu_read_unlock();
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -209,7 +213,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -229,7 +233,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
@@ -251,18 +256,21 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
+/* Please, do not use this function outside from the map creation path
+ * (e.g. in map update path) without taking care of setting the active
+ * memory cgroup (see at bpf_map_kmalloc_node() for example).
+ */
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
@@ -275,7 +283,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@@ -335,134 +343,375 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
map->max_entries = attr->max_entries;
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
+ map->map_extra = attr->map_extra;
}
-static int bpf_charge_memlock(struct user_struct *user, u32 pages)
+static int bpf_map_alloc_id(struct bpf_map *map)
{
- unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ int id;
- if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
- atomic_long_sub(pages, &user->locked_vm);
- return -EPERM;
- }
- return 0;
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+ idr_preload_end();
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
}
-static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
+void bpf_map_free_id(struct bpf_map *map)
{
- if (user)
- atomic_long_sub(pages, &user->locked_vm);
+ unsigned long flags;
+
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
+ spin_lock_irqsave(&map_idr_lock, flags);
+
+ idr_remove(&map_idr, map->id);
+ map->id = 0;
+
+ spin_unlock_irqrestore(&map_idr_lock, flags);
}
-int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
{
- u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
- struct user_struct *user;
- int ret;
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
+}
- if (size >= U32_MAX - PAGE_SIZE)
- return -E2BIG;
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
- user = get_current_user();
- ret = bpf_charge_memlock(user, pages);
- if (ret) {
- free_uid(user);
- return ret;
- }
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
- mem->pages = pages;
- mem->user = user;
+ return root_mem_cgroup;
+}
- return 0;
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
}
-void bpf_map_charge_finish(struct bpf_map_memory *mem)
+void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- bpf_uncharge_memlock(mem->user, mem->pages);
- free_uid(mem->user);
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kzalloc(size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
}
-void bpf_map_charge_move(struct bpf_map_memory *dst,
- struct bpf_map_memory *src)
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
{
- *dst = *src;
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
- /* Make sure src will not be used for the redundant uncharging. */
- memset(src, 0, sizeof(struct bpf_map_memory));
+ return ptr;
}
-int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+ size_t align, gfp_t flags)
{
- int ret;
+ struct mem_cgroup *memcg, *old_memcg;
+ void __percpu *ptr;
- ret = bpf_charge_memlock(map->memory.user, pages);
- if (ret)
- return ret;
- map->memory.pages += pages;
- return ret;
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
}
-void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
{
- bpf_uncharge_memlock(map->memory.user, pages);
- map->memory.pages -= pages;
}
-static int bpf_map_alloc_id(struct bpf_map *map)
+static void bpf_map_release_memcg(struct bpf_map *map)
{
- int id;
+}
+#endif
- idr_preload(GFP_KERNEL);
- spin_lock_bh(&map_idr_lock);
- id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
- if (id > 0)
- map->id = id;
- spin_unlock_bh(&map_idr_lock);
- idr_preload_end();
+static int btf_field_cmp(const void *a, const void *b)
+{
+ const struct btf_field *f1 = a, *f2 = b;
- if (WARN_ON_ONCE(!id))
- return -ENOSPC;
+ if (f1->offset < f2->offset)
+ return -1;
+ else if (f1->offset > f2->offset)
+ return 1;
+ return 0;
+}
- return id > 0 ? 0 : id;
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ u32 field_mask)
+{
+ struct btf_field *field;
+
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & field_mask))
+ return NULL;
+ return field;
}
-void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void btf_record_free(struct btf_record *rec)
{
- unsigned long flags;
+ int i;
- /* Offloaded maps are removed from the IDR store when their device
- * disappears - even if someone holds an fd to them they are unusable,
- * the memory is gone, all ops will fail; they are simply waiting for
- * refcnt to drop to be freed.
- */
- if (!map->id)
+ if (IS_ERR_OR_NULL(rec))
return;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ /* Nothing to release */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+ kfree(rec);
+}
- if (do_idr_lock)
- spin_lock_irqsave(&map_idr_lock, flags);
- else
- __acquire(&map_idr_lock);
+void bpf_map_free_record(struct bpf_map *map)
+{
+ btf_record_free(map->record);
+ map->record = NULL;
+}
- idr_remove(&map_idr, map->id);
- map->id = 0;
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
- if (do_idr_lock)
- spin_unlock_irqrestore(&map_idr_lock, flags);
- else
- __release(&map_idr_lock);
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = offsetof(struct btf_record, fields[rec->cnt]);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
+ return ERR_PTR(-ENOMEM);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ /* Nothing to acquire */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
+ }
+ new_rec->cnt++;
+ }
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
+}
+
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
+{
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
+ int size;
+
+ if (!a_has_fields && !b_has_fields)
+ return true;
+ if (a_has_fields != b_has_fields)
+ return false;
+ if (rec_a->cnt != rec_b->cnt)
+ return false;
+ size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
+}
+
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
+
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
+ void *xchgd_field;
+
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ migrate_disable();
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
+ migrate_enable();
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
}
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- struct bpf_map_memory mem;
+ struct btf_record *rec = map->record;
+ struct btf *btf = map->btf;
- bpf_map_charge_move(&mem, &map->memory);
security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
- bpf_map_charge_finish(&mem);
+ /* Delay freeing of btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ btf_record_free(rec);
+ /* Delay freeing of btf for maps, as map_free callback may need
+ * struct_meta info which will be freed with btf_put().
+ */
+ btf_put(btf);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -473,23 +722,45 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
}
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+ bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_map_free_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
-static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_put(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
- bpf_map_free_id(map, do_idr_lock);
- btf_put(map->btf);
- INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
- }
-}
+ bpf_map_free_id(map);
-void bpf_map_put(struct bpf_map *map)
-{
- __bpf_map_put(map, true);
+ WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ else if (READ_ONCE(map->free_after_rcu_gp))
+ call_rcu(&map->rcu, bpf_map_free_rcu_gp);
+ else
+ bpf_map_free_in_work(map);
+ }
}
EXPORT_SYMBOL_GPL(bpf_map_put);
@@ -523,16 +794,22 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
+{
+ return map->ops->map_mem_usage(map);
+}
+
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- type = array->aux->type;
- jited = array->aux->jited;
+ if (map_type_contains_progs(map)) {
+ spin_lock(&map->owner.lock);
+ type = map->owner.type;
+ jited = map->owner.jited;
+ spin_unlock(&map->owner.lock);
}
seq_printf(m,
@@ -541,6 +818,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
+ "map_extra:\t%#llx\n"
"memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
@@ -549,7 +827,8 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->value_size,
map->max_entries,
map->map_flags,
- map->memory.pages * 1ULL << PAGE_SHIFT,
+ (unsigned long long)map->map_extra,
+ bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@@ -582,11 +861,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt++;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_inc(map);
}
/* called for all unmapped memory region (including initial) */
@@ -594,11 +870,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt--;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_dec(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -611,7 +884,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -638,17 +911,17 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
- vma->vm_flags &= ~VM_MAYEXEC;
+ vm_flags_clear(vma, VM_MAYEXEC);
if (!(vma->vm_flags & VM_WRITE))
/* disallow re-mapping with PROT_WRITE */
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
if (err)
goto out;
if (vma->vm_flags & VM_MAYWRITE)
- map->writecnt++;
+ bpf_map_write_active_inc(map);
out:
mutex_unlock(&map->freeze_mutex);
return err;
@@ -760,37 +1033,103 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT | BPF_REFCOUNT,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
+
+ if (!bpf_capable()) {
+ ret = -EPERM;
+ goto free_map_tab;
+ }
+ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+ ret = -EACCES;
+ goto free_map_tab;
+ }
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_REFCOUNT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
- if (map->ops->map_check_btf)
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
+ if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+ if (ret < 0)
+ goto free_map_tab;
+ }
return ret;
+free_map_tab:
+ bpf_map_free_record(map);
+ return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_map_memory mem;
+ u32 map_type = attr->map_type;
struct bpf_map *map;
int f_flags;
int err;
@@ -807,6 +1146,10 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
}
+ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_extra != 0)
+ return -EINVAL;
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -817,9 +1160,85 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
+ map_type = attr->map_type;
+ if (map_type >= ARRAY_SIZE(bpf_map_types))
+ return -EINVAL;
+ map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[map_type];
+ if (!ops)
+ return -EINVAL;
+
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return err;
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return -EINVAL;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF map
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
+
+ /* check privileged map type permissions */
+ switch (map_type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ /* unprivileged */
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ case BPF_MAP_TYPE_LPM_TRIE:
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ case BPF_MAP_TYPE_STACK_TRACE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_STRUCT_OPS:
+ case BPF_MAP_TYPE_CPUMAP:
+ if (!bpf_capable())
+ return -EPERM;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ case BPF_MAP_TYPE_XSKMAP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ WARN(1, "unsupported map type %d", map_type);
+ return -EPERM;
+ }
+
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return PTR_ERR(map);
+ map->ops = ops;
+ map->map_type = map_type;
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
@@ -829,8 +1248,8 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner.lock);
- map->spin_lock_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -846,6 +1265,11 @@ static int map_create(union bpf_attr *attr)
err = PTR_ERR(btf);
goto free_map;
}
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EACCES;
+ goto free_map;
+ }
map->btf = btf;
if (attr->btf_value_type_id) {
@@ -869,6 +1293,8 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_sec;
+ bpf_map_save_memcg(map);
+
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
@@ -887,9 +1313,7 @@ free_map_sec:
security_bpf_map_free(map);
free_map:
btf_put(map->btf);
- bpf_map_charge_move(&mem, &map->memory);
map->ops->map_free(map);
- bpf_map_charge_finish(&mem);
return err;
}
@@ -935,6 +1359,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
+EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -951,8 +1376,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -983,7 +1410,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
if (key_size)
- return memdup_user(ukey, key_size);
+ return vmemdup_user(ukey, key_size);
if (ukey)
return ERR_PTR(-EINVAL);
@@ -991,6 +1418,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
return NULL;
}
+static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
+{
+ if (key_size)
+ return kvmemdup_bpfptr(ukey, key_size);
+
+ if (!bpfptr_is_null(ukey))
+ return ERR_PTR(-EINVAL);
+
+ return NULL;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
@@ -1021,7 +1459,7 @@ static int map_lookup_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1035,10 +1473,18 @@ static int map_lookup_elem(union bpf_attr *attr)
value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ if (copy_from_user(value, uvalue, value_size))
+ err = -EFAULT;
+ else
+ err = bpf_map_copy_value(map, key, value, attr->flags);
+ goto free_value;
+ }
+
err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -1050,9 +1496,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1061,10 +1507,10 @@ err_put:
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
-static int map_update_elem(union bpf_attr *attr)
+static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
- void __user *uvalue = u64_to_user_ptr(attr->value);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
+ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
@@ -1079,56 +1525,49 @@ static int map_update_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else
- value_size = map->value_size;
-
- err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value_size = bpf_map_value_size(map);
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
+ }
- err = -EFAULT;
- if (copy_from_user(value, uvalue, value_size) != 0)
- goto free_value;
-
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ if (!err)
+ maybe_wait_bpf_programs(map);
-free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1142,18 +1581,19 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1168,10 +1608,12 @@ static int map_delete_elem(union bpf_attr *attr)
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
+ if (!err)
+ maybe_wait_bpf_programs(map);
out:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1212,11 +1654,11 @@ static int map_get_next_key(union bpf_attr *attr)
}
err = -ENOMEM;
- next_key = kmalloc(map->key_size, GFP_USER);
+ next_key = kvmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1235,9 +1677,9 @@ out:
err = 0;
free_next_key:
- kfree(next_key);
+ kvfree(next_key);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1256,7 +1698,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1264,7 +1706,10 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1274,7 +1719,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1284,35 +1729,33 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(key);
+ kvfree(key);
+
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->map_fd;
void *key, *value;
- struct fd f;
int err = 0;
- f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1322,13 +1765,16 @@ int generic_map_update_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value) {
- kfree(key);
+ kvfree(key);
return -ENOMEM;
}
@@ -1339,18 +1785,20 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(value);
- kfree(key);
+ kvfree(value);
+ kvfree(key);
+
return err;
}
@@ -1372,7 +1820,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
value_size = bpf_map_value_size(map);
@@ -1384,13 +1832,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
if (put_user(0, &uattr->batch.count))
return -EFAULT;
- buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!buf_prevkey)
return -ENOMEM;
- buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
- kfree(buf_prevkey);
+ kvfree(buf_prevkey);
return -ENOMEM;
}
@@ -1440,6 +1888,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
swap(prev_key, key);
retry = MAP_LOOKUP_RETRIES;
cp++;
+ cond_resched();
}
if (err == -EFAULT)
@@ -1450,12 +1899,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = -EFAULT;
free_buf:
- kfree(buf_prevkey);
- kfree(buf);
+ kvfree(buf_prevkey);
+ kvfree(buf);
return err;
}
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
@@ -1471,34 +1920,61 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
+ if (attr->flags &&
+ (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if ((attr->flags & BPF_F_LOCK) &&
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
- } else {
- err = -ENOTSUPP;
+ } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (!bpf_map_is_offloaded(map)) {
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ }
}
if (err)
@@ -1512,10 +1988,11 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1536,14 +2013,18 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
fdput(f);
return -ENOTSUPP;
}
- mutex_lock(&map->freeze_mutex);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ fdput(f);
+ return -EPERM;
+ }
- if (map->writecnt) {
+ mutex_lock(&map->freeze_mutex);
+ if (bpf_map_write_active(map)) {
err = -EBUSY;
goto err_put;
}
@@ -1551,10 +2032,6 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!bpf_capable()) {
- err = -EPERM;
- goto err_put;
- }
WRITE_ONCE(map->frozen, true);
err_put:
@@ -1585,7 +2062,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -1613,7 +2090,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_irq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -1623,51 +2100,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
audit_log_end(ab);
}
-int __bpf_prog_charge(struct user_struct *user, u32 pages)
-{
- unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- unsigned long user_bufs;
-
- if (user) {
- user_bufs = atomic_long_add_return(pages, &user->locked_vm);
- if (user_bufs > memlock_limit) {
- atomic_long_sub(pages, &user->locked_vm);
- return -EPERM;
- }
- }
-
- return 0;
-}
-
-void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
-{
- if (user)
- atomic_long_sub(pages, &user->locked_vm);
-}
-
-static int bpf_prog_charge_memlock(struct bpf_prog *prog)
-{
- struct user_struct *user = get_current_user();
- int ret;
-
- ret = __bpf_prog_charge(user, prog->pages);
- if (ret) {
- free_uid(user);
- return ret;
- }
-
- prog->aux->user = user;
- return 0;
-}
-
-static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
-{
- struct user_struct *user = prog->aux->user;
-
- __bpf_prog_uncharge(user, prog->pages);
- free_uid(user);
-}
-
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
int id;
@@ -1687,8 +2119,10 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
+ unsigned long flags;
+
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@@ -1697,18 +2131,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_bh(&prog_idr_lock);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_bh(&prog_idr_lock);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -1717,7 +2143,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
- bpf_prog_uncharge_memlock(aux->prog);
+ free_uid(aux->user);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -1726,28 +2152,53 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
- bpf_prog_free_linfo(prog);
-
- if (deferred)
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
- else
+ module_put(prog->aux->mod);
+ kvfree(prog->aux->jited_linfo);
+ kvfree(prog->aux->linfo);
+ kfree(prog->aux->kfunc_tab);
+ if (prog->aux->attach_btf)
+ btf_put(prog->aux->attach_btf);
+
+ if (deferred) {
+ if (prog->aux->sleepable)
+ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ } else {
__bpf_prog_put_rcu(&prog->aux->rcu);
+ }
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void bpf_prog_put_deferred(struct work_struct *work)
{
- if (atomic64_dec_and_test(&prog->aux->refcnt)) {
- perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
- bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
- __bpf_prog_put_noref(prog, true);
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
+ __bpf_prog_put_noref(prog, true);
+}
+
+static void __bpf_prog_put(struct bpf_prog *prog)
+{
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
}
}
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -1759,28 +2210,48 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+struct bpf_prog_kstats {
+ u64 nsecs;
+ u64 cnt;
+ u64 misses;
+};
+
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
- struct bpf_prog_stats *stats)
+ struct bpf_prog_kstats *stats)
{
- u64 nsecs = 0, cnt = 0;
+ u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
for_each_possible_cpu(cpu) {
const struct bpf_prog_stats *st;
unsigned int start;
- u64 tnsecs, tcnt;
+ u64 tnsecs, tcnt, tmisses;
- st = per_cpu_ptr(prog->aux->stats, cpu);
+ st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
- tnsecs = st->nsecs;
- tcnt = st->cnt;
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ start = u64_stats_fetch_begin(&st->syncp);
+ tnsecs = u64_stats_read(&st->nsecs);
+ tcnt = u64_stats_read(&st->cnt);
+ tmisses = u64_stats_read(&st->misses);
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
+ misses += tmisses;
}
stats->nsecs = nsecs;
stats->cnt = cnt;
+ stats->misses = misses;
}
#ifdef CONFIG_PROC_FS
@@ -1788,7 +2259,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
@@ -1799,14 +2270,18 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"memlock:\t%llu\n"
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
- "run_cnt:\t%llu\n",
+ "run_cnt:\t%llu\n"
+ "recursion_misses:\t%llu\n"
+ "verified_insns:\t%u\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
prog->aux->id,
stats.nsecs,
- stats.cnt);
+ stats.cnt,
+ stats.misses,
+ prog->aux->verified_insns);
}
#endif
@@ -1889,7 +2364,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -1950,18 +2425,27 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_SK_REUSEPORT_SELECT;
+ break;
}
}
static int
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
- u32 btf_id, u32 prog_fd)
+ struct btf *attach_btf, u32 btf_id,
+ struct bpf_prog *dst_prog)
{
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
+ if (!attach_btf && !dst_prog)
+ return -EINVAL;
+
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
@@ -1973,7 +2457,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
- if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
+ if (attach_btf && (!btf_id || dst_prog))
+ return -EINVAL;
+
+ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
prog_type != BPF_PROG_TYPE_EXT)
return -EINVAL;
@@ -1981,6 +2468,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return 0;
@@ -1993,14 +2481,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -2021,10 +2514,27 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ if (expected_attach_type == BPF_SK_LOOKUP)
+ return 0;
+ return -EINVAL;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ switch (expected_attach_type) {
+ case BPF_SK_REUSEPORT_SELECT:
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_NETFILTER:
+ if (expected_attach_type == BPF_NETFILTER)
+ return 0;
+ return -EINVAL;
+ case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
- /* fallthrough */
+ fallthrough;
default:
return 0;
}
@@ -2042,7 +2552,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2051,6 +2560,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
@@ -2080,15 +2590,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
+#define BPF_PROG_LOAD_LAST_FIELD log_true_size
-static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
- struct bpf_prog *prog;
+ struct bpf_prog *prog, *dst_prog = NULL;
+ struct btf *attach_btf = NULL;
int err;
char license[128];
- bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
@@ -2096,7 +2606,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
- BPF_F_TEST_RND_HI32))
+ BPF_F_SLEEPABLE |
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY |
+ BPF_F_TEST_REG_INVARIANTS))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2104,14 +2618,15 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
!bpf_capable())
return -EPERM;
- /* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
-
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- is_gpl = license_is_gpl_compatible(license);
+ /* Intent here is for unprivileged_bpf_disabled to block BPF program
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out for these
+ * and other operations.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
@@ -2126,72 +2641,132 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
+ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
+ * or btf, we need to check which one it is
+ */
+ if (attr->attach_prog_fd) {
+ dst_prog = bpf_prog_get(attr->attach_prog_fd);
+ if (IS_ERR(dst_prog)) {
+ dst_prog = NULL;
+ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
+ if (IS_ERR(attach_btf))
+ return -EINVAL;
+ if (!btf_is_kernel(attach_btf)) {
+ /* attaching through specifying bpf_prog's BTF
+ * objects directly might be supported eventually
+ */
+ btf_put(attach_btf);
+ return -ENOTSUPP;
+ }
+ }
+ } else if (attr->attach_btf_id) {
+ /* fall back to vmlinux BTF, if BTF type ID is specified */
+ attach_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(attach_btf))
+ return PTR_ERR(attach_btf);
+ if (!attach_btf)
+ return -EINVAL;
+ btf_get(attach_btf);
+ }
+
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
- attr->attach_btf_id,
- attr->attach_prog_fd))
+ attach_btf, attr->attach_btf_id,
+ dst_prog)) {
+ if (dst_prog)
+ bpf_prog_put(dst_prog);
+ if (attach_btf)
+ btf_put(attach_btf);
return -EINVAL;
+ }
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
- if (!prog)
+ if (!prog) {
+ if (dst_prog)
+ bpf_prog_put(dst_prog);
+ if (attach_btf)
+ btf_put(attach_btf);
return -ENOMEM;
+ }
prog->expected_attach_type = attr->expected_attach_type;
+ prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
- if (attr->attach_prog_fd) {
- struct bpf_prog *tgt_prog;
-
- tgt_prog = bpf_prog_get(attr->attach_prog_fd);
- if (IS_ERR(tgt_prog)) {
- err = PTR_ERR(tgt_prog);
- goto free_prog_nouncharge;
- }
- prog->aux->linked_prog = tgt_prog;
- }
-
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dst_prog = dst_prog;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
+ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
err = security_bpf_prog_alloc(prog->aux);
if (err)
- goto free_prog_nouncharge;
-
- err = bpf_prog_charge_memlock(prog);
- if (err)
- goto free_prog_sec;
+ goto free_prog;
+ prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
- bpf_prog_insn_size(prog)) != 0)
- goto free_prog;
+ if (copy_from_bpfptr(prog->insns,
+ make_bpfptr(attr->insns, uattr.is_kernel),
+ bpf_prog_insn_size(prog)) != 0)
+ goto free_prog_sec;
+ /* copy eBPF program license from user space */
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
+ goto free_prog_sec;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
if (err)
- goto free_prog;
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ /*
+ * Bookkeeping for managing the program attachment chain.
+ *
+ * It might be tempting to set attach_tracing_prog flag at the attachment
+ * time, but this will not prevent from loading bunch of tracing prog
+ * first, then attach them one to another.
+ *
+ * The flag attach_tracing_prog is set for the whole program lifecycle, and
+ * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+ * programs cannot change attachment target.
+ */
+ if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+ dst_prog->type == BPF_PROG_TYPE_TRACING) {
+ prog->aux->attach_tracing_prog = true;
}
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
- goto free_prog;
+ goto free_prog_sec;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
- goto free_prog;
+ goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2231,34 +2806,50 @@ free_used_maps:
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
- __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
-free_prog:
- bpf_prog_uncharge_memlock(prog);
free_prog_sec:
+ free_uid(prog->aux->user);
security_bpf_prog_free(prog->aux);
-free_prog_nouncharge:
+free_prog:
+ if (prog->aux->attach_btf)
+ btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD file_flags
+#define BPF_OBJ_LAST_FIELD path_fd
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
+ return -EINVAL;
+
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_pin_user(attr->bpf_fd, path_fd,
+ u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
{
+ int path_fd;
+
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
- attr->file_flags & ~BPF_OBJ_FLAG_MASK)
+ attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
attr->file_flags);
}
@@ -2284,10 +2875,12 @@ static void bpf_link_free_id(int id)
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper marksbpf_link as
+ * anon_inode's release() call. This helper marks bpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
@@ -2322,27 +2915,31 @@ static void bpf_link_put_deferred(struct work_struct *work)
bpf_link_free(link);
}
-/* bpf_link_put can be called from atomic context, but ensures that resources
- * are freed from process context
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
- if (in_atomic()) {
- INIT_WORK(&link->work, bpf_link_put_deferred);
- schedule_work(&link->work);
- } else {
- bpf_link_free(link);
- }
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
+}
+EXPORT_SYMBOL(bpf_link_put);
+
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
}
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return 0;
}
@@ -2364,16 +2961,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -2408,7 +3008,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2479,21 +3079,27 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
return link;
}
-
-struct bpf_tracing_link {
- struct bpf_link link;
- enum bpf_attach_type attach_type;
-};
+EXPORT_SYMBOL(bpf_link_get_from_fd);
static void bpf_tracing_link_release(struct bpf_link *link)
{
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link.link);
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
+ tr_link->trampoline));
+
+ bpf_trampoline_put(tr_link->trampoline);
+
+ /* tgt_prog is NULL if target is a kernel function */
+ if (tr_link->tgt_prog)
+ bpf_prog_put(tr_link->tgt_prog);
}
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
kfree(tr_link);
}
@@ -2502,20 +3108,30 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
+ u32 target_btf_id, target_obj_id;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &target_obj_id, &target_btf_id);
seq_printf(seq,
- "attach_type:\t%d\n",
- tr_link->attach_type);
+ "attach_type:\t%d\n"
+ "target_obj_id:\t%u\n"
+ "target_btf_id:\t%u\n",
+ tr_link->attach_type,
+ target_obj_id,
+ target_btf_id);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
info->tracing.attach_type = tr_link->attach_type;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &info->tracing.target_obj_id,
+ &info->tracing.target_btf_id);
return 0;
}
@@ -2527,10 +3143,16 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
.fill_link_info = bpf_tracing_link_fill_link_info,
};
-static int bpf_tracing_prog_attach(struct bpf_prog *prog)
+static int bpf_tracing_prog_attach(struct bpf_prog *prog,
+ int tgt_prog_fd,
+ u32 btf_id,
+ u64 bpf_cookie)
{
struct bpf_link_primer link_primer;
+ struct bpf_prog *tgt_prog = NULL;
+ struct bpf_trampoline *tr = NULL;
struct bpf_tracing_link *link;
+ u64 key = 0;
int err;
switch (prog->type) {
@@ -2559,30 +3181,162 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
goto out_put_prog;
}
+ if (!!tgt_prog_fd != !!btf_id) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ if (tgt_prog_fd) {
+ /*
+ * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+ * part would be changed to implement the same for
+ * BPF_PROG_TYPE_TRACING, do not forget to update the way how
+ * attach_tracing_prog flag is set.
+ */
+ if (prog->type != BPF_PROG_TYPE_EXT) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ tgt_prog = bpf_prog_get(tgt_prog_fd);
+ if (IS_ERR(tgt_prog)) {
+ err = PTR_ERR(tgt_prog);
+ tgt_prog = NULL;
+ goto out_put_prog;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
+ }
+
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
&bpf_tracing_link_lops, prog);
link->attach_type = prog->expected_attach_type;
+ link->link.cookie = bpf_cookie;
- err = bpf_link_prime(&link->link, &link_primer);
- if (err) {
- kfree(link);
- goto out_put_prog;
+ mutex_lock(&prog->aux->dst_mutex);
+
+ /* There are a few possible cases here:
+ *
+ * - if prog->aux->dst_trampoline is set, the program was just loaded
+ * and not yet attached to anything, so we can use the values stored
+ * in prog->aux
+ *
+ * - if prog->aux->dst_trampoline is NULL, the program has already been
+ * attached to a target and its initial target was cleared (below)
+ *
+ * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
+ * target_btf_id using the link_create API.
+ *
+ * - if tgt_prog == NULL when this function was called using the old
+ * raw_tracepoint_open API, and we need a target from prog->aux
+ *
+ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
+ * was detached and is going for re-attachment.
+ *
+ * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
+ * are NULL, then program was already attached and user did not provide
+ * tgt_prog_fd so we have no way to find out or create trampoline
+ */
+ if (!prog->aux->dst_trampoline && !tgt_prog) {
+ /*
+ * Allow re-attach for TRACING and LSM programs. If it's
+ * currently linked, bpf_trampoline_link_prog will fail.
+ * EXT programs need to specify tgt_prog_fd, so they
+ * re-attach in separate code path.
+ */
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ /* We can allow re-attach only if we have valid attach_btf. */
+ if (!prog->aux->attach_btf) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ btf_id = prog->aux->attach_btf_id;
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
- err = bpf_trampoline_link_prog(prog);
+ if (!prog->aux->dst_trampoline ||
+ (key && key != prog->aux->dst_trampoline->key)) {
+ /* If there is no saved target, or the specified target is
+ * different from the destination specified at load time, we
+ * need a new trampoline and a check for compatibility
+ */
+ struct bpf_attach_target_info tgt_info = {};
+
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
+ &tgt_info);
+ if (err)
+ goto out_unlock;
+
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ } else {
+ /* The caller didn't specify a target, or the target was the
+ * same as the destination supplied during program load. This
+ * means we can reuse the trampoline and reference from program
+ * load time, and there is no need to allocate a new one. This
+ * can only happen once for any program, as the saved values in
+ * prog->aux are cleared below.
+ */
+ tr = prog->aux->dst_trampoline;
+ tgt_prog = prog->aux->dst_prog;
+ }
+
+ err = bpf_link_prime(&link->link.link, &link_primer);
+ if (err)
+ goto out_unlock;
+
+ err = bpf_trampoline_link_prog(&link->link, tr);
if (err) {
bpf_link_cleanup(&link_primer);
- goto out_put_prog;
+ link = NULL;
+ goto out_unlock;
}
+ link->tgt_prog = tgt_prog;
+ link->trampoline = tr;
+
+ /* Always clear the trampoline and target prog from prog->aux to make
+ * sure the original attach destination is not kept alive after a
+ * program is (re-)attached to another target.
+ */
+ if (prog->aux->dst_prog &&
+ (tgt_prog_fd || tr != prog->aux->dst_trampoline))
+ /* got extra prog ref from syscall, or attaching to different prog */
+ bpf_prog_put(prog->aux->dst_prog);
+ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
+ /* we allocated a new trampoline, so free the old one */
+ bpf_trampoline_put(prog->aux->dst_trampoline);
+
+ prog->aux->dst_prog = NULL;
+ prog->aux->dst_trampoline = NULL;
+ mutex_unlock(&prog->aux->dst_mutex);
+
return bpf_link_settle(&link_primer);
+out_unlock:
+ if (tr && tr != prog->aux->dst_trampoline)
+ bpf_trampoline_put(tr);
+ mutex_unlock(&prog->aux->dst_mutex);
+ kfree(link);
out_put_prog:
- bpf_prog_put(prog);
+ if (tgt_prog_fd && tgt_prog)
+ bpf_prog_put(tgt_prog);
return err;
}
@@ -2619,6 +3373,25 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
raw_tp_link->btp->tp->name);
}
+static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
+{
+ if (ulen >= len + 1) {
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, buf, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
@@ -2629,7 +3402,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
u32 ulen = info->raw_tracepoint.tp_name_len;
size_t tp_len = strlen(tp_name);
- if (ulen && !ubuf)
+ if (!ulen ^ !ubuf)
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
@@ -2637,86 +3410,280 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
if (!ubuf)
return 0;
- if (ulen >= tp_len + 1) {
- if (copy_to_user(ubuf, tp_name, tp_len + 1))
- return -EFAULT;
+ return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
+ .release = bpf_raw_tp_link_release,
+ .dealloc = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
+};
+
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+ struct bpf_link link;
+ struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+ struct perf_event *event = perf_link->perf_file->private_data;
+
+ perf_event_free_bpf_prog(event);
+ fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+ kfree(perf_link);
+}
+
+static int bpf_perf_link_fill_common(const struct perf_event *event,
+ char __user *uname, u32 ulen,
+ u64 *probe_offset, u64 *probe_addr,
+ u32 *fd_type, unsigned long *missed)
+{
+ const char *buf;
+ u32 prog_id;
+ size_t len;
+ int err;
+
+ if (!ulen ^ !uname)
+ return -EINVAL;
+
+ err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
+ probe_offset, probe_addr, missed);
+ if (err)
+ return err;
+ if (!uname)
+ return 0;
+ if (buf) {
+ len = strlen(buf);
+ err = bpf_copy_to_user(uname, buf, ulen, len);
+ if (err)
+ return err;
} else {
char zero = '\0';
- if (copy_to_user(ubuf, tp_name, ulen - 1))
+ if (put_user(zero, uname))
return -EFAULT;
- if (put_user(zero, ubuf + ulen - 1))
- return -EFAULT;
- return -ENOSPC;
}
+ return 0;
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ unsigned long missed;
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+ uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
+ ulen = info->perf_event.kprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type, &missed);
+ if (err)
+ return err;
+ if (type == BPF_FD_TYPE_KRETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_KPROBE;
+
+ info->perf_event.kprobe.offset = offset;
+ info->perf_event.kprobe.missed = missed;
+ if (!kallsyms_show_value(current_cred()))
+ addr = 0;
+ info->perf_event.kprobe.addr = addr;
return 0;
}
+#endif
-static const struct bpf_link_ops bpf_raw_tp_link_lops = {
- .release = bpf_raw_tp_link_release,
- .dealloc = bpf_raw_tp_link_dealloc,
- .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
- .fill_link_info = bpf_raw_tp_link_fill_link_info,
+#ifdef CONFIG_UPROBE_EVENTS
+static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
+ ulen = info->perf_event.uprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type, NULL);
+ if (err)
+ return err;
+
+ if (type == BPF_FD_TYPE_URETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.offset = offset;
+ return 0;
+}
+#endif
+
+static int bpf_perf_link_fill_probe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fill_kprobe(event, info);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fill_uprobe(event, info);
+#endif
+ return -EOPNOTSUPP;
+}
+
+static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u32 ulen;
+
+ uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
+ ulen = info->perf_event.tracepoint.name_len;
+ info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
+}
+
+static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ info->perf_event.event.type = event->attr.type;
+ info->perf_event.event.config = event->attr.config;
+ info->perf_event.type = BPF_PERF_EVENT_EVENT;
+ return 0;
+}
+
+static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_link_fill_perf_event(event, info);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_perf_link_fill_tracepoint(event, info);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_perf_link_fill_probe(event, info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+ .release = bpf_perf_link_release,
+ .dealloc = bpf_perf_link_dealloc,
+ .fill_link_info = bpf_perf_link_fill_link_info,
};
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_perf_link *link;
+ struct perf_event *event;
+ struct file *perf_file;
+ int err;
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ perf_file = perf_event_get(attr->link_create.target_fd);
+ if (IS_ERR(perf_file))
+ return PTR_ERR(perf_file);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_file;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ link->perf_file = perf_file;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_file;
+ }
+
+ event = perf_file->private_data;
+ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_file;
+ }
+ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out_put_file:
+ fput(perf_file);
+ return err;
+}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+ const char __user *user_tp_name)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
const char *tp_name;
char buf[128];
int err;
- if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
- return -EINVAL;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_EXT:
case BPF_PROG_TYPE_LSM:
- if (attr->raw_tracepoint.name) {
+ if (user_tp_name)
/* The attach point for this category of programs
* should be specified via btf_id during program load.
*/
- err = -EINVAL;
- goto out_put_prog;
- }
+ return -EINVAL;
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP) {
tp_name = prog->aux->attach_func_name;
break;
}
- return bpf_tracing_prog_attach(prog);
+ return bpf_tracing_prog_attach(prog, 0, 0, 0);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
- if (strncpy_from_user(buf,
- u64_to_user_ptr(attr->raw_tracepoint.name),
- sizeof(buf) - 1) < 0) {
- err = -EFAULT;
- goto out_put_prog;
- }
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
default:
- err = -EINVAL;
- goto out_put_prog;
+ return -EINVAL;
}
btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp) {
- err = -ENOENT;
- goto out_put_prog;
- }
+ if (!btp)
+ return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
@@ -2743,31 +3710,27 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
out_put_btp:
bpf_put_raw_tracepoint(btp);
-out_put_prog:
- bpf_prog_put(prog);
return err;
}
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
- enum bpf_attach_type attach_type)
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
- switch (prog->type) {
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
- case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
- /* cg-skb progs can be loaded by unpriv user.
- * check permissions at attach time.
- */
- return -EPERM;
- return prog->enforce_expected_attach_type &&
- prog->expected_attach_type != attach_type ?
- -EINVAL : 0;
- default:
- return 0;
- }
+ struct bpf_prog *prog;
+ int fd;
+
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ if (fd < 0)
+ bpf_prog_put(prog);
+ return fd;
}
static enum bpf_prog_type
@@ -2777,8 +3740,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
return BPF_PROG_TYPE_CGROUP_SKB;
- break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return BPF_PROG_TYPE_CGROUP_SOCK;
@@ -2786,14 +3749,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
case BPF_CGROUP_SOCK_OPS:
return BPF_PROG_TYPE_SOCK_OPS;
@@ -2803,6 +3771,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_MSG;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_SKB_VERDICT:
return BPF_PROG_TYPE_SK_SKB;
case BPF_LIRC_MODE2:
return BPF_PROG_TYPE_LIRC_MODE2;
@@ -2814,16 +3783,100 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
+ case BPF_LSM_MAC:
+ return BPF_PROG_TYPE_LSM;
+ case BPF_SK_LOOKUP:
+ return BPF_PROG_TYPE_SK_LOOKUP;
+ case BPF_XDP:
+ return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS &&
+ attach_type != BPF_NETKIT_PRIMARY &&
+ attach_type != BPF_NETKIT_PEER)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
+#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
+
+#define BPF_F_ATTACH_MASK_BASE \
+ (BPF_F_ALLOW_OVERRIDE | \
+ BPF_F_ALLOW_MULTI | \
+ BPF_F_REPLACE)
-#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
+#define BPF_F_ATTACH_MASK_MPROG \
+ (BPF_F_REPLACE | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_ID | \
+ BPF_F_LINK)
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2834,12 +3887,19 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
- return -EINVAL;
-
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
+ if (bpf_mprog_supported(ptype)) {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ } else {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+ return -EINVAL;
+ if (attr->relative_fd ||
+ attr->expected_revision)
+ return -EINVAL;
+ }
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
@@ -2868,7 +3928,19 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ case BPF_PROG_TYPE_LSM:
+ if (ptype == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type != BPF_LSM_CGROUP)
+ ret = -EINVAL;
+ else
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_attach(attr, prog);
+ else
+ ret = netkit_prog_attach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -2879,25 +3951,45 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return ret;
}
-#define BPF_PROG_DETACH_LAST_FIELD attach_type
+#define BPF_PROG_DETACH_LAST_FIELD expected_revision
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ struct bpf_prog *prog = NULL;
enum bpf_prog_type ptype;
+ int ret;
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
+ if (bpf_mprog_supported(ptype)) {
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ if (attr->attach_bpf_fd) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ } else if (attr->attach_flags ||
+ attr->relative_fd ||
+ attr->expected_revision) {
+ return -EINVAL;
+ }
switch (ptype) {
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_SK_SKB:
- return sock_map_prog_detach(attr, ptype);
+ ret = sock_map_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_LIRC_MODE2:
- return lirc_prog_detach(attr);
+ ret = lirc_prog_detach(attr);
+ break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return netns_bpf_prog_detach(attr, ptype);
+ ret = netns_bpf_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2905,13 +3997,26 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
- return cgroup_bpf_prog_detach(attr, ptype);
+ case BPF_PROG_TYPE_LSM:
+ ret = cgroup_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_detach(attr, prog);
+ else
+ ret = netkit_prog_detach(attr, prog);
+ break;
default:
- return -EINVAL;
+ ret = -EINVAL;
}
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -2927,36 +4032,55 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
+ case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return tcx_prog_query(attr, uattr);
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return netkit_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3033,6 +4157,25 @@ again:
return map;
}
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
+{
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&prog_idr_lock);
+again:
+ prog = idr_get_next(&prog_idr, id);
+ if (prog) {
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&prog_idr_lock);
+
+ return prog;
+}
+
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3120,21 +4263,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
const struct bpf_map *map;
int i;
+ mutex_lock(&prog->aux->used_maps_mutex);
for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
map = prog->aux->used_maps[i];
if (map == (void *)addr) {
*type = BPF_PSEUDO_MAP_FD;
- return map;
+ goto out;
}
if (!map->ops->map_direct_value_meta)
continue;
if (!map->ops->map_direct_value_meta(map, addr, off)) {
*type = BPF_PSEUDO_MAP_VALUE;
- return map;
+ goto out;
}
}
+ map = NULL;
- return NULL;
+out:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+ return map;
}
static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
@@ -3226,14 +4373,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
char __user *uinsns;
u32 ulen;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3252,6 +4400,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
memcpy(info.tag, prog->tag, sizeof(prog->tag));
memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+ mutex_lock(&prog->aux->used_maps_mutex);
ulen = info.nr_map_ids;
info.nr_map_ids = prog->aux->used_map_cnt;
ulen = min_t(u32, info.nr_map_ids, ulen);
@@ -3261,9 +4410,12 @@ static int bpf_prog_get_info_by_fd(struct file *file,
for (i = 0; i < ulen; i++)
if (put_user(prog->aux->used_maps[i]->id,
- &user_map_ids[i]))
+ &user_map_ids[i])) {
+ mutex_unlock(&prog->aux->used_maps_mutex);
return -EFAULT;
+ }
}
+ mutex_unlock(&prog->aux->used_maps_mutex);
err = set_info_rec_size(&info);
if (err)
@@ -3272,6 +4424,9 @@ static int bpf_prog_get_info_by_fd(struct file *file,
bpf_prog_get_stats(prog, &stats);
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
+ info.recursion_misses = stats.misses;
+
+ info.verified_insns = prog->aux->verified_insns;
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -3305,7 +4460,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -3419,7 +4574,10 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
if (prog->aux->btf)
- info.btf_id = btf_id(prog->aux->btf);
+ info.btf_id = btf_obj_id(prog->aux->btf);
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -3452,14 +4610,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
- if (put_user((__u64)(long)prog->aux->jited_linfo[i],
- &user_linfo[i]))
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
return -EFAULT;
}
} else {
@@ -3507,7 +4666,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3519,16 +4678,17 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ info.map_extra = map->map_extra;
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
- info.btf_id = btf_id(map->btf);
+ info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
@@ -3550,7 +4710,7 @@ static int bpf_btf_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
if (err)
return err;
@@ -3567,7 +4727,7 @@ static int bpf_link_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3578,7 +4738,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -3628,9 +4789,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
-static int bpf_btf_load(const union bpf_attr *attr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -3638,7 +4799,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -3709,7 +4870,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
pid_t pid = attr->task_fd_query.pid;
u32 fd = attr->task_fd_query.fd;
const struct perf_event *event;
- struct files_struct *files;
struct task_struct *task;
struct file *file;
int err;
@@ -3723,27 +4883,17 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
- if (!files)
- return -ENOENT;
-
err = 0;
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = fget_task(task, fd);
+ put_task_struct(task);
if (!file)
- err = -EBADF;
- else
- get_file(file);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (err)
- goto out;
+ return -EBADF;
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
@@ -3770,7 +4920,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
&buf, &probe_offset,
- &probe_addr);
+ &probe_addr, NULL);
if (!err)
err = bpf_task_fd_query_copy(attr, uattr, prog_id,
fd_type, buf,
@@ -3783,25 +4933,27 @@ out_not_supp:
err = -ENOTSUPP;
put_file:
fput(file);
-out:
return err;
}
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
union bpf_attr __user *uattr,
int cmd)
{
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
int err, ufd;
struct fd f;
@@ -3814,67 +4966,56 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if ((cmd == BPF_MAP_LOOKUP_BATCH ||
- cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ if (has_write)
+ bpf_map_write_active_inc(map);
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
-
- if (cmd != BPF_MAP_LOOKUP_BATCH &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
-
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
+ if (has_write) {
+ maybe_wait_bpf_programs(map);
+ bpf_map_write_active_dec(map);
+ }
fdput(f);
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
+static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
- if (attr->link_create.attach_type == BPF_TRACE_ITER &&
- prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, prog);
-
- return -EINVAL;
-}
-
-#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
-static int link_create(union bpf_attr *attr)
-{
- enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC)
- return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
- prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype);
+ prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
ret = bpf_prog_attach_check_attach_type(prog,
attr->link_create.attach_type);
if (ret)
- goto err_out;
+ goto out;
- switch (ptype) {
+ switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -3884,22 +5025,100 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_EXT:
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
+ break;
+ case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, prog);
+ if (attr->link_create.attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ ret = bpf_raw_tp_link_attach(prog, NULL);
+ else if (prog->expected_attach_type == BPF_TRACE_ITER)
+ ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
+ else
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+#ifdef CONFIG_NET
+ case BPF_PROG_TYPE_XDP:
+ ret = bpf_xdp_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
+ ret = tcx_link_attach(attr, prog);
+ else
+ ret = netkit_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
+#endif
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ ret = bpf_perf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
-err_out:
+out:
if (ret < 0)
bpf_prog_put(prog);
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -3920,6 +5139,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -3949,48 +5173,100 @@ out_put_progs:
if (ret)
bpf_prog_put(new_prog);
out_put_link:
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
-static int bpf_link_inc_not_zero(struct bpf_link *link)
+#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
+
+static int link_detach(union bpf_attr *attr)
{
- return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+ struct bpf_link *link;
+ int ret;
+
+ if (CHECK_ATTR(BPF_LINK_DETACH))
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->link_detach.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (link->ops->detach)
+ ret = link->ops->detach(link);
+ else
+ ret = -EOPNOTSUPP;
+
+ bpf_link_put_direct(link);
+ return ret;
}
-#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
+}
-static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_link *bpf_link_by_id(u32 id)
{
struct bpf_link *link;
- u32 id = attr->link_id;
- int fd, err;
- if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&link_idr_lock);
- link = idr_find(&link_idr, id);
/* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ link = idr_find(&link_idr, id);
if (link) {
if (link->id)
- err = bpf_link_inc_not_zero(link);
+ link = bpf_link_inc_not_zero(link);
else
- err = -EAGAIN;
+ link = ERR_PTR(-EAGAIN);
} else {
- err = -ENOENT;
+ link = ERR_PTR(-ENOENT);
}
spin_unlock_bh(&link_idr_lock);
+ return link;
+}
- if (err)
- return err;
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+ struct bpf_link *link;
+
+ spin_lock_bh(&link_idr_lock);
+again:
+ link = idr_get_next(&link_idr, id);
+ if (link) {
+ link = bpf_link_inc_not_zero(link);
+ if (IS_ERR(link)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ return link;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ link = bpf_link_by_id(id);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
fd = bpf_link_new_fd(link);
if (fd < 0)
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return fd;
}
@@ -4067,19 +5343,83 @@ static int bpf_iter_create(union bpf_attr *attr)
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return err;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
+
+static int bpf_prog_bind_map(union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct bpf_map **used_maps_old, **used_maps_new;
+ int i, ret = 0;
+
+ if (CHECK_ATTR(BPF_PROG_BIND_MAP))
+ return -EINVAL;
+
+ if (attr->prog_bind_map.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ map = bpf_map_get(attr->prog_bind_map.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto out_prog_put;
+ }
+
+ mutex_lock(&prog->aux->used_maps_mutex);
+
+ used_maps_old = prog->aux->used_maps;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++)
+ if (used_maps_old[i] == map) {
+ bpf_map_put(map);
+ goto out_unlock;
+ }
+
+ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
+ sizeof(used_maps_new[0]),
+ GFP_KERNEL);
+ if (!used_maps_new) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* The bpf program will not access the bpf map, but for the sake of
+ * simplicity, increase sleepable_refcnt for sleepable program as well.
+ */
+ if (prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+ memcpy(used_maps_new, used_maps_old,
+ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
+ used_maps_new[prog->aux->used_map_cnt] = map;
+
+ prog->aux->used_map_cnt++;
+ prog->aux->used_maps = used_maps_new;
+
+ kfree(used_maps_old);
+
+out_unlock:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+
+ if (ret)
+ bpf_map_put(map);
+out_prog_put:
+ bpf_prog_put(prog);
+ return ret;
+}
+
+static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
-
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
@@ -4087,7 +5427,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
memset(&attr, 0, sizeof(attr));
- if (copy_from_user(&attr, uattr, size) != 0)
+ if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
err = security_bpf(cmd, &attr, size);
@@ -4102,10 +5442,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
- err = map_update_elem(&attr);
+ err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -4114,7 +5454,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -4129,21 +5469,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
- err = bpf_prog_query(&attr, uattr);
+ err = bpf_prog_query(&attr, uattr.user);
break;
case BPF_PROG_TEST_RUN:
- err = bpf_prog_test_run(&attr, uattr);
+ err = bpf_prog_test_run(&attr, uattr.user);
break;
case BPF_PROG_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&prog_idr, &prog_idr_lock);
break;
case BPF_MAP_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&map_idr, &map_idr_lock);
break;
case BPF_BTF_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&btf_idr, &btf_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
@@ -4153,38 +5493,38 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_map_get_fd_by_id(&attr);
break;
case BPF_OBJ_GET_INFO_BY_FD:
- err = bpf_obj_get_info_by_fd(&attr, uattr);
+ err = bpf_obj_get_info_by_fd(&attr, uattr.user);
break;
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
- err = bpf_task_fd_query(&attr, uattr);
+ err = bpf_task_fd_query(&attr, uattr.user);
break;
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
case BPF_MAP_LOOKUP_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
break;
case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr,
+ err = bpf_map_do_batch(&attr, uattr.user,
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
break;
case BPF_MAP_UPDATE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
break;
case BPF_MAP_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
break;
case BPF_LINK_CREATE:
- err = link_create(&attr);
+ err = link_create(&attr, uattr);
break;
case BPF_LINK_UPDATE:
err = link_update(&attr);
@@ -4193,7 +5533,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_link_get_fd_by_id(&attr);
break;
case BPF_LINK_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&link_idr, &link_idr_lock);
break;
case BPF_ENABLE_STATS:
@@ -4202,6 +5542,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_ITER_CREATE:
err = bpf_iter_create(&attr);
break;
+ case BPF_LINK_DETACH:
+ err = link_detach(&attr);
+ break;
+ case BPF_PROG_BIND_MAP:
+ err = bpf_prog_bind_map(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -4209,3 +5555,259 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
return err;
}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+}
+
+static bool syscall_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= U16_MAX)
+ return false;
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
+{
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
+ case BPF_MAP_UPDATE_ELEM:
+ case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
+ case BPF_PROG_LOAD:
+ case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ run_ctx.bpf_cookie = 0;
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
+ /* recursion detected */
+ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
+ bpf_prog_put(prog);
+ return 0;
+#endif
+ default:
+ return ____bpf_sys_bpf(cmd, attr, size);
+ }
+}
+EXPORT_SYMBOL(kern_sys_bpf);
+
+static const struct bpf_func_proto bpf_sys_bpf_proto = {
+ .func = bpf_sys_bpf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+BPF_CALL_1(bpf_sys_close, u32, fd)
+{
+ /* When bpf program calls this helper there should not be
+ * an fdget() without matching completed fdput().
+ * This helper is allowed in the following callchain only:
+ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
+ */
+ return close_fd(fd);
+}
+
+static const struct bpf_func_proto bpf_sys_close_proto = {
+ .func = bpf_sys_close,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
+{
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ if (!bpf_dump_raw_ok(current_cred()))
+ return -EPERM;
+
+ *res = kallsyms_lookup_name(name);
+ return *res ? 0 : -ENOENT;
+}
+
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+ .func = bpf_kallsyms_lookup_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sys_bpf:
+ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+ case BPF_FUNC_btf_find_by_name_kind:
+ return &bpf_btf_find_by_name_kind_proto;
+ case BPF_FUNC_sys_close:
+ return &bpf_sys_close_proto;
+ case BPF_FUNC_kallsyms_lookup_name:
+ return &bpf_kallsyms_lookup_name_proto;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
+ .get_func_proto = syscall_prog_func_proto,
+ .is_valid_access = syscall_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops bpf_syscall_prog_ops = {
+ .test_run = bpf_prog_test_run_syscall,
+};
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ if (write)
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+
+static struct ctl_table bpf_syscall_table[] = {
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .mode = 0644,
+ .proc_handler = bpf_stats_handler,
+ },
+ { }
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+ register_sysctl_init("kernel", bpf_syscall_table);
+ return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 3b495773de5a..ef6911aee3bb 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -26,19 +26,19 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.read = btf_vmlinux_read,
};
-static struct kobject *btf_kobj;
+struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
- if (!__start_BTF)
+ bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
+
+ if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
if (!btf_kobj)
return -ENOMEM;
- bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
-
return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4dbf2b6035f8..ec4e97c61eef 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,9 +7,22 @@
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
+#include "mmap_unlock_work.h"
+
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -20,20 +33,103 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
- u32 *tid)
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task;
+ struct pid *pid;
+ u32 next_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
+ if (!task)
+ return NULL;
+
+retry:
+ task = __next_thread(task);
+ if (!task)
+ return NULL;
+
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid)
+ goto retry;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files)
+ goto retry;
+
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = idr_get_next(&ns->idr, tid);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
goto retry;
+ } else if (skip_if_dup_files && !thread_group_leader(task) &&
+ task->files == task->group_leader->files) {
+ put_task_struct(task);
+ task = NULL;
+ ++*tid;
+ goto retry;
}
}
rcu_read_unlock();
@@ -46,11 +142,12 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
- ++*pos;
+ if (*pos == 0)
+ ++*pos;
return task;
}
@@ -62,7 +159,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -88,7 +185,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
if (!prog)
return 0;
- meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
@@ -107,6 +203,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -120,98 +251,77 @@ struct bpf_iter_seq_task_file_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
- struct files_struct *files;
u32 tid;
u32 fd;
};
static struct file *
-task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
- struct task_struct **task, struct files_struct **fstruct)
+task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid, max_fds;
- struct files_struct *curr_files;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
- int curr_fd = info->fd;
+ unsigned int curr_fd = info->fd;
/* If this function returns a non-NULL file object,
- * it held a reference to the task/files_struct/file.
+ * it held a reference to the task/file.
* Otherwise, it does not hold any reference.
*/
again:
- if (*task) {
- curr_task = *task;
- curr_files = *fstruct;
+ if (info->task) {
+ curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid);
- if (!curr_task)
- return NULL;
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
+ if (!curr_task) {
+ info->task = NULL;
+ return NULL;
+ }
- curr_files = get_files_struct(curr_task);
- if (!curr_files) {
- put_task_struct(curr_task);
- curr_tid = ++(info->tid);
- info->fd = 0;
- goto again;
- }
-
- /* set *fstruct, *task and info->tid */
- *fstruct = curr_files;
- *task = curr_task;
- if (curr_tid == info->tid) {
+ /* set info->task */
+ info->task = curr_task;
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
rcu_read_lock();
- max_fds = files_fdtable(curr_files)->max_fds;
- for (; curr_fd < max_fds; curr_fd++) {
+ for (;; curr_fd++) {
struct file *f;
-
- f = fcheck_files(curr_files, curr_fd);
+ f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
if (!f)
- continue;
+ break;
/* set info->fd */
info->fd = curr_fd;
- get_file(f);
rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
rcu_read_unlock();
- put_files_struct(curr_files);
put_task_struct(curr_task);
- *task = NULL;
- *fstruct = NULL;
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
+ info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
- struct files_struct *files = NULL;
- struct task_struct *task = NULL;
struct file *file;
- file = task_file_seq_get_next(info, &task, &files);
- if (!file) {
- info->files = NULL;
- info->task = NULL;
- return NULL;
- }
-
- ++*pos;
- info->task = task;
- info->files = files;
+ info->task = NULL;
+ file = task_file_seq_get_next(info);
+ if (file && *pos == 0)
+ ++*pos;
return file;
}
@@ -219,24 +329,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
- struct files_struct *files = info->files;
- struct task_struct *task = info->task;
- struct file *file;
++*pos;
++info->fd;
fput((struct file *)v);
- file = task_file_seq_get_next(info, &task, &files);
- if (!file) {
- info->files = NULL;
- info->task = NULL;
- return NULL;
- }
-
- info->task = task;
- info->files = files;
-
- return file;
+ return task_file_seq_get_next(info);
}
struct bpf_iter__task_file {
@@ -283,18 +380,19 @@ static void task_file_seq_stop(struct seq_file *seq, void *v)
(void)__task_file_seq_show(seq, v, true);
} else {
fput((struct file *)v);
- put_files_struct(info->files);
put_task_struct(info->task);
- info->files = NULL;
info->task = NULL;
}
}
-static int init_seq_pidns(void *priv_data)
+static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -312,25 +410,316 @@ static const struct seq_operations task_file_seq_ops = {
.show = task_file_seq_show,
};
-static const struct bpf_iter_reg task_reg_info = {
- .target = "task",
+struct bpf_iter_seq_task_vma_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ u32 tid;
+ unsigned long prev_vm_start;
+ unsigned long prev_vm_end;
+};
+
+enum bpf_task_vma_iter_find_op {
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
+ task_vma_iter_find_vma, /* use find_vma() to find next vma */
+};
+
+static struct vm_area_struct *
+task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
+{
+ enum bpf_task_vma_iter_find_op op;
+ struct vm_area_struct *curr_vma;
+ struct task_struct *curr_task;
+ struct mm_struct *curr_mm;
+ u32 saved_tid = info->tid;
+
+ /* If this function returns a non-NULL vma, it holds a reference to
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
+ * If this function returns NULL, it does not hold any reference or
+ * lock.
+ */
+ if (info->task) {
+ curr_task = info->task;
+ curr_vma = info->vma;
+ curr_mm = info->mm;
+ /* In case of lock contention, drop mmap_lock to unblock
+ * the writer.
+ *
+ * After relock, call find(mm, prev_vm_end - 1) to find
+ * new vma to process.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * For example, curr_vma == VMA2. Before unlock, we set
+ *
+ * prev_vm_start = 8k
+ * prev_vm_end = 16k
+ *
+ * There are a few cases:
+ *
+ * 1) VMA2 is freed, but VMA3 exists.
+ *
+ * find_vma() will return VMA3, just process VMA3.
+ *
+ * 2) VMA2 still exists.
+ *
+ * find_vma() will return VMA2, process VMA2->next.
+ *
+ * 3) no more vma in this mm.
+ *
+ * Process the next task.
+ *
+ * 4) find_vma() returns a different vma, VMA2'.
+ *
+ * 4.1) If VMA2 covers same range as VMA2', skip VMA2',
+ * because we already covered the range;
+ * 4.2) VMA2 and VMA2' covers different ranges, process
+ * VMA2'.
+ */
+ if (mmap_lock_is_contended(curr_mm)) {
+ info->prev_vm_start = curr_vma->vm_start;
+ info->prev_vm_end = curr_vma->vm_end;
+ op = task_vma_iter_find_vma;
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
+ goto finish;
+ }
+ } else {
+ op = task_vma_iter_next_vma;
+ }
+ } else {
+again:
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
+ if (!curr_task) {
+ info->tid++;
+ goto finish;
+ }
+
+ if (saved_tid != info->tid) {
+ /* new task, process the first vma */
+ op = task_vma_iter_first_vma;
+ } else {
+ /* Found the same tid, which means the user space
+ * finished data in previous buffer and read more.
+ * We dropped mmap_lock before returning to user
+ * space, so it is necessary to use find_vma() to
+ * find the next vma to process.
+ */
+ op = task_vma_iter_find_vma;
+ }
+
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
+ goto next_task;
+
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
+ goto finish;
+ }
+ }
+
+ switch (op) {
+ case task_vma_iter_first_vma:
+ curr_vma = find_vma(curr_mm, 0);
+ break;
+ case task_vma_iter_next_vma:
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
+ break;
+ case task_vma_iter_find_vma:
+ /* We dropped mmap_lock so it is necessary to use find_vma
+ * to find the next vma. This is similar to the mechanism
+ * in show_smaps_rollup().
+ */
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
+ /* case 1) and 4.2) above just use curr_vma */
+
+ /* check for case 2) or case 4.1) above */
+ if (curr_vma &&
+ curr_vma->vm_start == info->prev_vm_start &&
+ curr_vma->vm_end == info->prev_vm_end)
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
+ break;
+ }
+ if (!curr_vma) {
+ /* case 3) above, or case 2) 4.1) with vma->next == NULL */
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
+ goto next_task;
+ }
+ info->task = curr_task;
+ info->vma = curr_vma;
+ info->mm = curr_mm;
+ return curr_vma;
+
+next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
+ put_task_struct(curr_task);
+ info->task = NULL;
+ info->mm = NULL;
+ info->tid++;
+ goto again;
+
+finish:
+ if (curr_task)
+ put_task_struct(curr_task);
+ info->task = NULL;
+ info->vma = NULL;
+ info->mm = NULL;
+ return NULL;
+}
+
+static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct vm_area_struct *vma;
+
+ vma = task_vma_seq_get_next(info);
+ if (vma && *pos == 0)
+ ++*pos;
+
+ return vma;
+}
+
+static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ ++*pos;
+ return task_vma_seq_get_next(info);
+}
+
+struct bpf_iter__task_vma {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ __bpf_md_ptr(struct vm_area_struct *, vma);
+};
+
+DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
+ struct task_struct *task, struct vm_area_struct *vma)
+
+static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct bpf_iter__task_vma ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.vma = info->vma;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_vma_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_vma_seq_show(seq, false);
+}
+
+static void task_vma_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_vma_seq_show(seq, true);
+ } else {
+ /* info->vma has not been seen by the BPF program. If the
+ * user space reads more, task_vma_seq_get_next should
+ * return this vma again. Set prev_vm_start to ~0UL,
+ * so that we don't skip the vma returned by the next
+ * find_vma() (case task_vma_iter_find_vma in
+ * task_vma_seq_get_next()).
+ */
+ info->prev_vm_start = ~0UL;
+ info->prev_vm_end = info->vma->vm_end;
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
+ put_task_struct(info->task);
+ info->task = NULL;
+ }
+}
+
+static const struct seq_operations task_vma_seq_ops = {
+ .start = task_vma_seq_start,
+ .next = task_vma_seq_next,
+ .stop = task_vma_seq_stop,
+ .show = task_vma_seq_show,
+};
+
+static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
+};
+
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
+static struct bpf_iter_reg task_reg_info = {
+ .target = "task",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
+ .seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
-static const struct bpf_iter_reg task_file_reg_info = {
- .target = "task_file",
+static const struct bpf_iter_seq_info task_file_seq_info = {
.seq_ops = &task_file_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
+};
+
+static struct bpf_iter_reg task_file_reg_info = {
+ .target = "task_file",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task_file, task),
@@ -338,16 +727,351 @@ static const struct bpf_iter_reg task_file_reg_info = {
{ offsetof(struct bpf_iter__task_file, file),
PTR_TO_BTF_ID_OR_NULL },
},
+ .seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
+};
+
+static const struct bpf_iter_seq_info task_vma_seq_info = {
+ .seq_ops = &task_vma_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
+};
+
+static struct bpf_iter_reg task_vma_reg_info = {
+ .target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_vma, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_vma, vma),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
+};
+
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__bpf_kfunc_end_defs();
+
+#ifdef CONFIG_CGROUPS
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__bpf_kfunc_end_defs();
+
+#endif /* CONFIG_CGROUPS */
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all process in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
};
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ kit->pos = NULL;
+
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = __next_thread(kit->pos);
+ if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
+ return pos;
+
+get_next_task:
+ kit->task = next_task(kit->task);
+ if (kit->task == &init_task)
+ kit->pos = NULL;
+ else
+ kit->pos = kit->task;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__bpf_kfunc_end_defs();
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
static int __init task_iter_init(void)
{
- int ret;
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
+
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- return bpf_iter_reg_target(&task_file_reg_info);
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
+ ret = bpf_iter_reg_target(&task_file_reg_info);
+ if (ret)
+ return ret;
+
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
+ return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
new file mode 100644
index 000000000000..2e4885e7781f
--- /dev/null
+++ b/kernel/bpf/tcx.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+#include <linux/netdevice.h>
+
+#include <net/tcx.h>
+
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+void tcx_uninstall(struct net_device *dev, bool ingress)
+{
+ struct bpf_mprog_entry *entry, *entry_new = NULL;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ bool active;
+
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry)
+ return;
+ active = tcx_entry(entry)->miniq_active;
+ if (active)
+ bpf_mprog_clear_all(entry, &entry_new);
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ tcx_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ tcx_skeys_dec(ingress);
+ }
+ if (!active)
+ tcx_entry_free(entry);
+}
+
+int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->query.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool created, ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = tcx->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry)
+ return -ENOMEM;
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+ return ret;
+}
+
+static void tcx_link_release(struct bpf_link *link)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev)
+ goto out;
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ tcx->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void tcx_link_dealloc(struct bpf_link *link)
+{
+ kfree(tcx_link(link));
+}
+
+static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ tcx->location,
+ tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress");
+}
+
+static int tcx_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ info->tcx.ifindex = ifindex;
+ info->tcx.attach_type = tcx->location;
+ return 0;
+}
+
+static int tcx_link_detach(struct bpf_link *link)
+{
+ tcx_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops tcx_link_lops = {
+ .release = tcx_link_release,
+ .detach = tcx_link_detach,
+ .dealloc = tcx_link_dealloc,
+ .update_prog = tcx_link_update,
+ .show_fdinfo = tcx_link_fdinfo,
+ .fill_link_info = tcx_link_fill_info,
+};
+
+static int tcx_link_init(struct tcx_link *tcx,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog);
+ tcx->location = attr->link_create.attach_type;
+ tcx->dev = dev;
+ return bpf_link_prime(&tcx->link, link_primer);
+}
+
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct net_device *dev;
+ struct tcx_link *tcx;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ tcx = kzalloc(sizeof(*tcx), GFP_USER);
+ if (!tcx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = tcx_link_init(tcx, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(tcx);
+ goto out;
+ }
+ ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags,
+ attr->link_create.tcx.relative_fd,
+ attr->link_create.tcx.expected_revision);
+ if (ret) {
+ tcx->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ceac5281bd31..9dbc31b25e3d 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* half-multiply add: acc += (unknown * mask * value).
- * An intermediate step in the multiply algorithm.
+/* Generate partial products by multiplying each bit in the multiplier (tnum a)
+ * with the multiplicand (tnum b), and add the partial products after
+ * appropriately bit-shifting them. Instead of directly performing tnum addition
+ * on the generated partial products, equivalenty, decompose each partial
+ * product into two tnums, consisting of the value-sum (acc_v) and the
+ * mask-sum (acc_m) and then perform tnum addition on them. The following paper
+ * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
*/
-static struct tnum hma(struct tnum acc, u64 value, u64 mask)
-{
- while (mask) {
- if (mask & 1)
- acc = tnum_add(acc, TNUM(0, value));
- mask >>= 1;
- value <<= 1;
- }
- return acc;
-}
-
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- struct tnum acc;
- u64 pi;
-
- pi = a.value * b.value;
- acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
- return hma(acc, b.mask, a.value);
+ u64 acc_v = a.value * b.value;
+ struct tnum acc_m = TNUM(0, 0);
+
+ while (a.value || a.mask) {
+ /* LSB of tnum a is a certain 1 */
+ if (a.value & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ /* LSB of tnum a is uncertain */
+ else if (a.mask & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ /* Note: no case for LSB is certain 0 */
+ a = tnum_rshift(a, 1);
+ b = tnum_lshift(b, 1);
+ }
+ return tnum_add(TNUM(acc_v, 0), acc_m);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
@@ -169,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b)
return a.value == b.value;
}
-int tnum_strn(char *str, size_t size, struct tnum a)
-{
- return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
-}
-EXPORT_SYMBOL_GPL(tnum_strn);
-
int tnum_sbin(char *str, size_t size, struct tnum a)
{
size_t n;
@@ -205,7 +202,12 @@ struct tnum tnum_clear_subreg(struct tnum a)
return tnum_lshift(tnum_rshift(a, 32), 32);
}
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+ return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
- return tnum_or(tnum_clear_subreg(a), tnum_const(value));
+ return tnum_with_subreg(a, tnum_const(value));
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9be85aa4ec5f..d382f5ebe06c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -7,6 +7,12 @@
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -23,26 +29,96 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
-void *bpf_jit_alloc_exec_page(void)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
{
- void *image;
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
+
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
+
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
*/
- set_memory_x((long)image, 1);
- return image;
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
}
+#endif
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type ptype = prog->type;
+
+ return (ptype == BPF_PROG_TYPE_TRACING &&
+ (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN)) ||
+ (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+}
+
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
- ksym->end = ksym->start + PAGE_SIZE;
+ ksym->end = ksym->start + size;
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -55,19 +131,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
- struct bpf_ksym *ksym = &tr->ksym;
-
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
- bpf_image_ksym_add(tr->image, ksym);
-}
-
-struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
- void *image;
int i;
mutex_lock(&trampoline_mutex);
@@ -81,14 +148,16 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
-
- /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec_page();
- if (!image) {
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
kfree(tr);
tr = NULL;
goto out;
}
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
@@ -97,47 +166,38 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
- tr->image = image;
- INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
- bpf_trampoline_ksym_add(tr);
out:
mutex_unlock(&trampoline_mutex);
return tr;
}
-static int is_ftrace_location(void *ip)
-{
- long addr;
-
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
{
void *ip = tr->func.addr;
int ret;
- if (tr->func.ftrace_managed)
- ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ }
return ret;
}
@@ -145,93 +205,298 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
+ tr->func.ftrace_managed = true;
+ }
- if (tr->func.ftrace_managed)
- ret = register_ftrace_direct((long)ip, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ }
+
return ret;
}
-static struct bpf_tramp_progs *
-bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
+static struct bpf_tramp_links *
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
- const struct bpf_prog_aux *aux;
- struct bpf_tramp_progs *tprogs;
- struct bpf_prog **progs;
+ struct bpf_tramp_link *link;
+ struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_link **links;
int kind;
*total = 0;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return ERR_PTR(-ENOMEM);
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- tprogs[kind].nr_progs = tr->progs_cnt[kind];
+ tlinks[kind].nr_links = tr->progs_cnt[kind];
*total += tr->progs_cnt[kind];
- progs = tprogs[kind].progs;
+ links = tlinks[kind].links;
+
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= link->link.prog->call_get_func_ip;
+ *links++ = link;
+ }
+ }
+ return tlinks;
+}
+
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
+{
+ bpf_image_ksym_del(&im->ksym);
+ arch_free_bpf_trampoline(im->image, im->size);
+ bpf_jit_uncharge_modmem(im->size);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
+}
+
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_tramp_image_free(im);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+ schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(pcref, struct bpf_tramp_image, pcref);
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
- *progs++ = aux->prog;
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ if (im->ip_after_call)
+ /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+ percpu_ref_kill(&im->pcref);
+ else
+ /* the case of fentry trampoline */
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+ /* The trampoline image that calls original function is using:
+ * rcu_read_lock_trace to protect sleepable bpf progs
+ * rcu_read_lock to protect normal bpf progs
+ * percpu_ref to protect trampoline itself
+ * rcu tasks to protect trampoline asm not covered by percpu_ref
+ * (which are few asm insns before __bpf_tramp_enter and
+ * after __bpf_tramp_exit)
+ *
+ * The trampoline is unreachable before bpf_tramp_image_put().
+ *
+ * First, patch the trampoline to avoid calling into fexit progs.
+ * The progs will be freed even if the original function is still
+ * executing or sleeping.
+ * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+ * first few asm instructions to execute and call into
+ * __bpf_tramp_enter->percpu_ref_get.
+ * Then use percpu_ref_kill to wait for the trampoline and the original
+ * function to finish.
+ * Then use call_rcu_tasks() to make sure few asm insns in
+ * the trampoline epilogue are done as well.
+ *
+ * In !PREEMPT case the task that got interrupted in the first asm
+ * insns won't go through an RCU quiescent state which the
+ * percpu_ref_kill will be waiting for. Hence the first
+ * call_rcu_tasks() is not necessary.
+ */
+ if (im->ip_after_call) {
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+ NULL, im->ip_epilogue);
+ WARN_ON(err);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+ else
+ percpu_ref_kill(&im->pcref);
+ return;
}
- return tprogs;
+
+ /* The trampoline without fexit and fmod_ret progs doesn't call original
+ * function and doesn't use percpu_ref.
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+ * and normal progs.
+ */
+ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
- struct bpf_tramp_progs *tprogs;
- u32 flags = BPF_TRAMP_F_RESTORE_REGS;
- int err, total;
+ struct bpf_tramp_image *im;
+ struct bpf_ksym *ksym;
+ void *image;
+ int err = -ENOMEM;
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
- tprogs = bpf_trampoline_get_progs(tr, &total);
- if (IS_ERR(tprogs))
- return PTR_ERR(tprogs);
+ err = bpf_jit_charge_modmem(size);
+ if (err)
+ goto out_free_im;
+ im->size = size;
+
+ err = -ENOMEM;
+ im->image = image = arch_alloc_bpf_trampoline(size);
+ if (!image)
+ goto out_uncharge;
+
+ err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+ if (err)
+ goto out_free_image;
+
+ ksym = &im->ksym;
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
+ bpf_image_ksym_add(image, size, ksym);
+ return im;
+
+out_free_image:
+ arch_free_bpf_trampoline(im->image, im->size);
+out_uncharge:
+ bpf_jit_uncharge_modmem(size);
+out_free_im:
+ kfree(im);
+out:
+ return ERR_PTR(err);
+}
+
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_tramp_links *tlinks;
+ u32 orig_flags = tr->flags;
+ bool ip_arg = false;
+ int err, total, size;
+
+ tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tlinks))
+ return PTR_ERR(tlinks);
if (total == 0) {
- err = unregister_fentry(tr, old_image);
- tr->selector = 0;
+ err = unregister_fentry(tr, tr->cur_image->image);
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
goto out;
}
- if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
- tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
- flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
- /* Though the second half of trampoline page is unused a task could be
- * preempted in the middle of the first half of trampoline and two
- * updates to trampoline would change the code from underneath the
- * preempted task. Hence wait for tasks to voluntarily schedule or go
- * to userspace.
- */
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
+
+ if (ip_arg)
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+ (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
- synchronize_rcu_tasks();
+ size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+ tlinks, tr->func.addr);
+ if (size < 0) {
+ err = size;
+ goto out;
+ }
+
+ if (size > PAGE_SIZE) {
+ err = -E2BIG;
+ goto out;
+ }
- err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
- &tr->func.model, flags, tprogs,
+ im = bpf_tramp_image_alloc(tr->key, size);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
+ &tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
- goto out;
+ goto out_free;
+
+ arch_protect_bpf_trampoline(im->image, im->size);
- if (tr->selector)
+ WARN_ON(tr->cur_image && total == 0);
+ if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, old_image, new_image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
else
/* first time registering */
- err = register_fentry(tr, new_image);
+ err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ /* reset fops->func and fops->trampoline for re-register */
+ tr->fops->func = NULL;
+ tr->fops->trampoline = 0;
+
+ /* free im memory and reallocate later */
+ bpf_tramp_image_free(im);
+ goto again;
+ }
+#endif
if (err)
- goto out;
- tr->selector++;
+ goto out_free;
+
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
out:
- kfree(tprogs);
+ /* If any error happens, restore previous flags */
+ if (err)
+ tr->flags = orig_flags;
+ kfree(tlinks);
return err;
+
+out_free:
+ bpf_tramp_image_free(im);
+ goto out;
}
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
@@ -256,149 +521,572 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_prog *prog)
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
- struct bpf_trampoline *tr;
+ struct bpf_tramp_link *link_exiting;
int err = 0;
- int cnt;
+ int cnt = 0, i;
- tr = prog->aux->trampoline;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
- if (tr->extension_prog) {
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+ if (tr->extension_prog)
/* cannot attach fentry/fexit if extension prog is attached.
* cannot overwrite extension prog either.
*/
- err = -EBUSY;
- goto out;
- }
- cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ return -EBUSY;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
- if (cnt) {
- err = -EBUSY;
- goto out;
- }
- tr->extension_prog = prog;
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- prog->bpf_func);
- goto out;
+ if (cnt)
+ return -EBUSY;
+ tr->extension_prog = link->link.prog;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
}
- if (cnt >= BPF_MAX_TRAMP_PROGS) {
- err = -E2BIG;
- goto out;
- }
- if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
- err = -EBUSY;
- goto out;
+ return -EBUSY;
+ hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ if (link_exiting->link.prog != link->link.prog)
+ continue;
+ /* prog already linked */
+ return -EBUSY;
}
- hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+
+ hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(prog->aux->trampoline);
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
-out:
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
- struct bpf_trampoline *tr;
int err;
- tr = prog->aux->trampoline;
- kind = bpf_attach_type_to_tramp(prog);
- mutex_lock(&tr->mutex);
+ kind = bpf_attach_type_to_tramp(link->link.prog);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
- goto out;
+ return err;
}
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(prog->aux->trampoline);
-out:
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr);
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
+ mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
return err;
}
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
+struct bpf_trampoline *bpf_trampoline_get(u64 key,
+ struct bpf_attach_target_info *tgt_info)
+{
+ struct bpf_trampoline *tr;
+
+ tr = bpf_trampoline_lookup(key);
+ if (!tr)
+ return NULL;
+
+ mutex_lock(&tr->mutex);
+ if (tr->func.addr)
+ goto out;
+
+ memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
+ tr->func.addr = (void *)tgt_info->tgt_addr;
+out:
+ mutex_unlock(&tr->mutex);
+ return tr;
+}
+
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ int i;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
if (!refcount_dec_and_test(&tr->refcnt))
goto out;
WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
- goto out;
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
- goto out;
- bpf_image_ksym_del(&tr->ksym);
- /* wait for tasks to get out of trampoline before freeing it */
- synchronize_rcu_tasks();
- bpf_jit_free_exec(tr->image);
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+ goto out;
+
+ /* This code will be executed even when the last bpf_tramp_image
+ * is alive. All progs are detached from the trampoline and the
+ * trampoline image is patched with jmp into epilogue to skip
+ * fexit progs. The fentry-only trampoline will be freed via
+ * multiple rcu callbacks.
+ */
hlist_del(&tr->hlist);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
}
-/* The logic is similar to BPF_PROG_RUN, but with an explicit
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+ u64 start = NO_START_TIME;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+ start = sched_clock();
+ if (unlikely(!start))
+ start = NO_START_TIME;
+ }
+ return start;
+}
+
+/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
- * call _bpf_prog_enter
+ * call __bpf_prog_enter
* call prog->bpf_func
* call __bpf_prog_exit
+ *
+ * __bpf_prog_enter returns:
+ * 0 - skip execution of the bpf prog
+ * 1 - execute bpf prog
+ * [2..MAX_U64] - execute bpf prog and record execution time.
+ * This is start time.
*/
-u64 notrace __bpf_prog_enter(void)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- u64 start = 0;
-
rcu_read_lock();
migrate_disable();
- if (static_branch_unlikely(&bpf_stats_enabled_key))
- start = sched_clock();
- return start;
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
- __releases(RCU)
+static void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
{
struct bpf_prog_stats *stats;
if (static_branch_unlikely(&bpf_stats_enabled_key) &&
- /* static_key could be enabled in __bpf_prog_enter
- * and disabled in __bpf_prog_exit.
+ /* static_key could be enabled in __bpf_prog_enter*
+ * and disabled in __bpf_prog_exit*.
* And vice versa.
- * Hence check that 'start' is not zero.
+ * Hence check that 'start' is valid.
*/
- start) {
- stats = this_cpu_ptr(prog->aux->stats);
- u64_stats_update_begin(&stats->syncp);
- stats->cnt++;
- stats->nsecs += sched_clock() - start;
- u64_stats_update_end(&stats->syncp);
+ start > NO_START_TIME) {
+ unsigned long flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
+}
+
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ migrate_enable();
+ rcu_read_unlock();
+}
+
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock();
+}
+
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+ percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+ percpu_ref_put(&tr->pcref);
+}
+
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_progs *tprogs,
- void *orig_call)
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+ void *image;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE))
+ return NULL;
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (image)
+ set_vm_flush_reset_perms(image);
+ return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ /* bpf_jit_free_exec doesn't need "size", but
+ * bpf_prog_pack_free() needs it.
+ */
+ bpf_jit_free_exec(image);
+}
+
+void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_rox((long)image, 1);
+}
+
+void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_nx((long)image, 1);
+ set_memory_rw((long)image, 1);
+}
+
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 94cead5a43e5..ddea9567f755 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4,6 +4,7 @@
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -21,6 +22,12 @@
#include <linux/ctype.h>
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/poison.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
+#include <net/xdp.h>
#include "disasm.h"
@@ -35,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#undef BPF_LINK_TYPE
};
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -46,7 +56,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* - unreachable insns exist (shouldn't be a forest. program = one function)
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
* analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
@@ -131,7 +141,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* If it's ok, then verifier allows this BPF_CALL insn and looks at
* .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
* R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
*
* When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
* insn, the register holding that pointer in the true branch changes state to
@@ -185,6 +195,18 @@ struct bpf_verifier_stack_elem {
POISON_POINTER_DELTA))
#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr);
+static bool is_trusted_reg(const struct bpf_reg_state *reg);
+
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
@@ -227,516 +249,1052 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0;
+}
+
+static bool bpf_pseudo_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_CALL;
+}
+
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
bool pkt_access;
+ u8 release_regno;
int regno;
int access_size;
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
+ int map_uid;
int func_id;
+ struct btf *btf;
u32 btf_id;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+ u32 subprogno;
+ struct btf_field *kptr_field;
+};
+
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+
+ /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
+ * generally to pass info about user-defined local kptr types to later
+ * verification logic
+ * bpf_obj_drop/bpf_percpu_obj_drop
+ * Record the local kptr type to be drop'd
+ * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+ * Record the local kptr type to be refcount_incr'd and use
+ * arg_owning_ref to determine whether refcount_acquire should be
+ * fallible
+ */
+ struct btf *arg_btf;
+ u32 arg_btf_id;
+ bool arg_owning_ref;
+
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ u32 ref_obj_id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ u64 mem_size;
};
struct btf *btf_vmlinux;
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
- const struct bpf_line_info *linfo;
- const struct bpf_prog *prog;
- u32 i, nr_linfo;
+ struct bpf_verifier_env *env = private_data;
+ va_list args;
- prog = env->prog;
- nr_linfo = prog->aux->nr_linfo;
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
- if (!nr_linfo || insn_off >= prog->len)
- return NULL;
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
- linfo = prog->aux->linfo;
- for (i = 1; i < nr_linfo; i++)
- if (insn_off < linfo[i].insn_off)
- break;
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct bpf_retval_range range, const char *ctx,
+ const char *reg_name)
+{
+ bool unknown = true;
- return &linfo[i - 1];
+ verbose(env, "%s the register %s has", ctx, reg_name);
+ if (reg->smin_value > S64_MIN) {
+ verbose(env, " smin=%lld", reg->smin_value);
+ unknown = false;
+ }
+ if (reg->smax_value < S64_MAX) {
+ verbose(env, " smax=%lld", reg->smax_value);
+ unknown = false;
+ }
+ if (unknown)
+ verbose(env, " unknown scalar value");
+ verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
+static bool type_may_be_null(u32 type)
{
- unsigned int n;
+ return type & PTR_MAYBE_NULL;
+}
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+static bool reg_not_null(const struct bpf_reg_state *reg)
+{
+ enum bpf_reg_type type;
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
+ type = reg->type;
+ if (type_may_be_null(type))
+ return false;
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
+ type = base_type(type);
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
+ type == PTR_TO_SOCK_COMMON ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
+ type == PTR_TO_MEM;
+}
- if (log->level == BPF_LOG_KERNEL) {
- pr_err("BPF:%s\n", log->kbuf);
- return;
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
}
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
+ return rec;
}
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
{
- char zero = 0;
+ struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
- if (!bpf_verifier_log_needed(log))
- return;
+ return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
+}
+
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info *info;
+
+ if (!env->prog->aux->func_info)
+ return "";
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
+ info = &env->prog->aux->func_info[subprog];
+ return btf_type_name(env->prog->aux->btf, info->type_id);
}
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
{
- va_list args;
+ struct bpf_subprog_info *info = subprog_info(env, subprog);
- if (!bpf_verifier_log_needed(&env->log))
- return;
+ info->is_cb = true;
+ info->is_async_cb = true;
+ info->is_exception_cb = true;
+}
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ return subprog_info(env, subprog)->is_exception_cb;
}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- struct bpf_verifier_env *env = private_data;
- va_list args;
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
+}
- if (!bpf_verifier_log_needed(&env->log))
- return;
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
+}
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_ringbuf_reserve ||
+ func_id == BPF_FUNC_kptr_xchg)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
{
- va_list args;
+ return func_id == BPF_FUNC_tcp_sock ||
+ func_id == BPF_FUNC_sk_fullsock ||
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_mptcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
+}
- if (!bpf_verifier_log_needed(log))
- return;
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
}
-static const char *ltrim(const char *s)
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
{
- while (isspace(*s))
- s++;
+ return func_id == BPF_FUNC_timer_set_callback;
+}
- return s;
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
}
-__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
- u32 insn_off,
- const char *prefix_fmt, ...)
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
{
- const struct bpf_line_info *linfo;
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
- if (!bpf_verifier_log_needed(&env->log))
- return;
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
- linfo = find_linfo(env, insn_off);
- if (!linfo || linfo == env->prev_linfo)
- return;
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
+static int __get_spi(s32 off)
+{
+ return (-off - 1) / BPF_REG_SIZE;
+}
+
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
- if (prefix_fmt) {
- va_list args;
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
+{
+ int off, spi;
- va_start(args, prefix_fmt);
- bpf_verifier_vlog(&env->log, prefix_fmt, args);
- va_end(args);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
+ return -EINVAL;
}
- verbose(env, "%s\n",
- ltrim(btf_name_by_offset(env->prog->aux->btf,
- linfo->line_off)));
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
- env->prev_linfo = linfo;
+ spi = __get_spi(off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ return -ERANGE;
+ return spi;
}
-static bool type_is_pkt_pointer(enum bpf_reg_type type)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- return type == PTR_TO_PACKET ||
- type == PTR_TO_PACKET_META;
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
}
-static bool type_is_sk_pointer(enum bpf_reg_type type)
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCK_COMMON ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_XDP_SOCK;
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
-static bool reg_type_not_null(enum bpf_reg_type type)
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_MAP_VALUE ||
- type == PTR_TO_SOCK_COMMON;
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ return BPF_DYNPTR_TYPE_LOCAL;
+ case DYNPTR_TYPE_RINGBUF:
+ return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
+ default:
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ default:
+ return 0;
+ }
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return type == BPF_DYNPTR_TYPE_RINGBUF;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_MEM ||
- type == PTR_TO_MEM_OR_NULL;
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
{
- return type == ARG_PTR_TO_SOCK_COMMON;
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
}
-/* Determine whether the function releases some resources allocated by another
- * function call. The first reference type argument will be assumed to be
- * released by release_reference().
- */
-static bool is_release_function(enum bpf_func_id func_id)
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type type;
+ int spi, i, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_DYNPTR;
+ state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+ }
+
+ type = arg_to_dynptr_type(arg_type);
+ if (type == BPF_DYNPTR_TYPE_INVALID)
+ return -EINVAL;
+
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
+
+ if (dynptr_type_refcounted(type)) {
+ /* The id is used to track proper releasing */
+ int id;
+
+ if (clone_ref_obj_id)
+ id = clone_ref_obj_id;
+ else
+ id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
+ }
+
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
{
- return func_id == BPF_FUNC_sk_release ||
- func_id == BPF_FUNC_ringbuf_submit ||
- func_id == BPF_FUNC_ringbuf_discard;
+ int i;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
+ *
+ * While we don't allow reading STACK_INVALID, it is still possible to
+ * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
+ * helpers or insns can do partial read of that part without failing,
+ * but check_stack_range_initialized, check_stack_read_var_off, and
+ * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
+ * the slot conservatively. Hence we need to prevent those liveness
+ * marking walks.
+ *
+ * This was not a problem before because STACK_INVALID is only set by
+ * default (where the default reg state has its reg->parent as NULL), or
+ * in clean_live_states after REG_LIVE_DONE (at which point
+ * mark_reg_read won't walk reg->parent chain), but not randomly during
+ * verifier state exploration (like we did above). Hence, for our case
+ * parentage chain will still be live (i.e. reg->parent may be
+ * non-NULL), while earlier reg->parent was NULL, so we need
+ * REG_LIVE_WRITTEN to screen off read marker propagation when it is
+ * done later on reads or by mark_dynptr_read as well to unnecessary
+ * mark registers in verifier state.
+ */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ref_obj_id, i;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ invalidate_dynptr(env, state, spi);
+ return 0;
+ }
+
+ ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+ /* If the dynptr has a ref_obj_id, then we need to invalidate
+ * two things:
+ *
+ * 1) Any dynptrs with a matching ref_obj_id (clones)
+ * 2) Any slices derived from this dynptr.
+ */
+
+ /* Invalidate any slices associated with this dynptr */
+ WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+ /* Invalidate any dynptr clones */
+ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+ continue;
+
+ /* it should always be the case that if the ref obj id
+ * matches then the stack slot also belongs to a
+ * dynptr
+ */
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+ verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
+ return -EFAULT;
+ }
+ if (state->stack[i].spilled_ptr.dynptr.first_slot)
+ invalidate_dynptr(env, state, i);
+ }
+
+ return 0;
}
-static bool is_acquire_function(enum bpf_func_id func_id,
- const struct bpf_map *map)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
- if (func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_ringbuf_reserve)
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
+{
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
+
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
+
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Same reason as unmark_stack_slots_dynptr above */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
+ */
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
+ return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int i, spi;
+
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
- if (func_id == BPF_FUNC_map_lookup_elem &&
- (map_type == BPF_MAP_TYPE_SOCKMAP ||
- map_type == BPF_MAP_TYPE_SOCKHASH))
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ return false;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi;
+
+ /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+ if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
- return false;
+ dynptr_type = arg_to_dynptr_type(arg_type);
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
}
-static bool is_ptr_cast_function(enum bpf_func_id func_id)
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
{
- return func_id == BPF_FUNC_tcp_sock ||
- func_id == BPF_FUNC_sk_fullsock;
-}
-
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
-};
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
-static char slot_type_char[] = {
- [STACK_INVALID] = '?',
- [STACK_SPILL] = 'r',
- [STACK_MISC] = 'm',
- [STACK_ZERO] = '0',
-};
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_kfunc_rcu_protected(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
+ }
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
-static void print_liveness(struct bpf_verifier_env *env,
- enum bpf_reg_liveness live)
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
- verbose(env, "_");
- if (live & REG_LIVE_READ)
- verbose(env, "r");
- if (live & REG_LIVE_WRITTEN)
- verbose(env, "w");
- if (live & REG_LIVE_DONE)
- verbose(env, "D");
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
}
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
{
- struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
- return cur->frame[reg->frameno];
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
}
-const char *kernel_type_name(u32 id)
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
{
- return btf_name_by_offset(btf_vmlinux,
- btf_type_by_id(btf_vmlinux, id)->name_off);
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return -EINVAL;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (st->type & PTR_UNTRUSTED)
+ return -EPROTO;
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return -EINVAL;
+ if (i != 0 && st->ref_obj_id)
+ return -EINVAL;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return -EINVAL;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return -EINVAL;
+ }
+
+ return 0;
}
-static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
- const struct bpf_reg_state *reg;
- enum bpf_reg_type t;
- int i;
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
- if (state->frameno)
- verbose(env, " frame%d:", state->frameno);
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- t = reg->type;
- if (t == NOT_INIT)
- continue;
- verbose(env, " R%d", i);
- print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str[t]);
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
- tnum_is_const(reg->var_off)) {
- /* reg->off should be 0 for SCALAR_VALUE */
- verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
- if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
- verbose(env, "%s", kernel_type_name(reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
- if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
- if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
- else if (t == CONST_PTR_TO_MAP ||
- t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
- if (tnum_is_const(reg->var_off)) {
- /* Typically an immediate SCALAR_VALUE, but
- * could be a pointer whose offset is too big
- * for reg->off
- */
- verbose(env, ",imm=%llx", reg->var_off.value);
- } else {
- if (reg->smin_value != reg->umin_value &&
- reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
- if (reg->smax_value != reg->umax_value &&
- reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
- if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
- if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
- if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
- }
- if (reg->s32_min_value != reg->smin_value &&
- reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
- if (reg->s32_max_value != reg->smax_value &&
- reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
- if (reg->u32_min_value != reg->umin_value &&
- reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
- if (reg->u32_max_value != reg->umax_value &&
- reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
- }
- verbose(env, ")");
- }
- }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- char types_buf[BPF_REG_SIZE + 1];
- bool valid = false;
- int j;
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
- for (j = 0; j < BPF_REG_SIZE; j++) {
- if (state->stack[i].slot_type[j] != STACK_INVALID)
- valid = true;
- types_buf[j] = slot_type_char[
- state->stack[i].slot_type[j]];
- }
- types_buf[BPF_REG_SIZE] = 0;
- if (!valid)
- continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
- reg = &state->stack[i].spilled_ptr;
- t = reg->type;
- verbose(env, "=%s", reg_type_str[t]);
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
- verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
- verbose(env, "=%s", types_buf);
- }
- }
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
- }
- verbose(env, "\n");
-}
-
-#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int copy_##NAME##_state(struct bpf_func_state *dst, \
- const struct bpf_func_state *src) \
-{ \
- if (!src->FIELD) \
- return 0; \
- if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \
- /* internal bug, make state invalid to reject the program */ \
- memset(dst, 0, sizeof(*dst)); \
- return -EFAULT; \
- } \
- memcpy(dst->FIELD, src->FIELD, \
- sizeof(*src->FIELD) * (src->COUNT / SIZE)); \
- return 0; \
-}
-/* copy_reference_state() */
-COPY_STATE_FN(reference, acquired_refs, refs, 1)
-/* copy_stack_state() */
-COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef COPY_STATE_FN
-
-#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
- bool copy_old) \
-{ \
- u32 old_size = state->COUNT; \
- struct bpf_##NAME##_state *new_##FIELD; \
- int slot = size / SIZE; \
- \
- if (size <= old_size || !size) { \
- if (copy_old) \
- return 0; \
- state->COUNT = slot * SIZE; \
- if (!size && old_size) { \
- kfree(state->FIELD); \
- state->FIELD = NULL; \
- } \
- return 0; \
- } \
- new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
- GFP_KERNEL); \
- if (!new_##FIELD) \
- return -ENOMEM; \
- if (copy_old) { \
- if (state->FIELD) \
- memcpy(new_##FIELD, state->FIELD, \
- sizeof(*new_##FIELD) * (old_size / SIZE)); \
- memset(new_##FIELD + old_size / SIZE, 0, \
- sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
- } \
- state->COUNT = slot * SIZE; \
- kfree(state->FIELD); \
- state->FIELD = new_##FIELD; \
- return 0; \
-}
-/* realloc_reference_state() */
-REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
-/* realloc_stack_state() */
-REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef REALLOC_STATE_FN
-
-/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
- * make it consume minimal amount of memory. check_stack_write() access from
- * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which realloc_stack_state() copies over. It points to previous
- * bpf_verifier_state which is never reallocated.
+/* The reg state of a pointer or a bounded scalar was saved when
+ * it was spilled to the stack.
*/
-static int realloc_func_state(struct bpf_func_state *state, int stack_size,
- int refs_size, bool copy_old)
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
{
- int err = realloc_reference_state(state, refs_size, copy_old);
- if (err)
- return err;
- return realloc_stack_state(state, stack_size, copy_old);
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
+}
+
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
+ * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ */
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
+{
+ if (*stype == STACK_ZERO)
+ return;
+ if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ return;
+ *stype = STACK_MISC;
+}
+
+static void scrub_spilled_slot(u8 *stype)
+{
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
+}
+
+/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
+ * small to hold src. This is different from krealloc since we don't want to preserve
+ * the contents of dst.
+ *
+ * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could
+ * not be allocated.
+ */
+static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
+{
+ size_t alloc_bytes;
+ void *orig = dst;
+ size_t bytes;
+
+ if (ZERO_OR_NULL_PTR(src))
+ goto out;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
+ }
+
+ memcpy(dst, src, bytes);
+out:
+ return dst ? dst : ZERO_SIZE_PTR;
+}
+
+/* resize an array from old_n items to new_n items. the array is reallocated if it's too
+ * small to hold new_n items. new items are zeroed out if the array grows.
+ *
+ * Contrary to krealloc_array, does not free arr if new_n is zero.
+ */
+static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
+{
+ size_t alloc_size;
+ void *new_arr;
+
+ if (!new_n || old_n == new_n)
+ goto out;
+
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
+ if (!new_arr) {
+ kfree(arr);
+ return NULL;
+ }
+ arr = new_arr;
+
+ if (new_n > old_n)
+ memset(arr + old_n * size, 0, (new_n - old_n) * size);
+
+out:
+ return arr ? arr : ZERO_SIZE_PTR;
+}
+
+static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
+ sizeof(struct bpf_reference_state), GFP_KERNEL);
+ if (!dst->refs)
+ return -ENOMEM;
+
+ dst->acquired_refs = src->acquired_refs;
+ return 0;
+}
+
+static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ size_t n = src->allocated_stack / BPF_REG_SIZE;
+
+ dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!dst->stack)
+ return -ENOMEM;
+
+ dst->allocated_stack = src->allocated_stack;
+ return 0;
+}
+
+static int resize_reference_state(struct bpf_func_state *state, size_t n)
+{
+ state->refs = realloc_array(state->refs, state->acquired_refs, n,
+ sizeof(struct bpf_reference_state));
+ if (!state->refs)
+ return -ENOMEM;
+
+ state->acquired_refs = n;
+ return 0;
+}
+
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
+{
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+ /* The stack size is always a multiple of BPF_REG_SIZE. */
+ size = round_up(size, BPF_REG_SIZE);
+ n = size / BPF_REG_SIZE;
+
+ if (old_n >= n)
+ return 0;
+
+ state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state));
+ if (!state->stack)
+ return -ENOMEM;
+
+ state->allocated_stack = size;
+
+ /* update known max for given subprogram */
+ if (env->subprog_info[state->subprogno].stack_depth < size)
+ env->subprog_info[state->subprogno].stack_depth = size;
+
+ return 0;
}
/* Acquire a pointer id from the env and update the state->refs to include
@@ -750,12 +1308,13 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
int new_ofs = state->acquired_refs;
int id, err;
- err = realloc_reference_state(state, state->acquired_refs + 1, true);
+ err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
return err;
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -768,6 +1327,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -779,18 +1341,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
return -EINVAL;
}
-static int transfer_reference_state(struct bpf_func_state *dst,
- struct bpf_func_state *src)
-{
- int err = realloc_reference_state(dst, src->acquired_refs, false);
- if (err)
- return err;
- err = copy_reference_state(dst, src);
- if (err)
- return err;
- return 0;
-}
-
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -829,10 +1379,6 @@ static int copy_func_state(struct bpf_func_state *dst,
{
int err;
- err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
- false);
- if (err)
- return err;
memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
err = copy_reference_state(dst, src);
if (err)
@@ -844,30 +1390,34 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
- u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;
- if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
- kfree(dst_state->jmp_history);
- dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- }
- memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+ dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_USER);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
+ dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->active_lock.ptr = src->active_lock.ptr;
+ dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
+ dst_state->used_as_loop_entry = src->used_as_loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -883,11 +1433,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+ int fr;
+
+ if (a->curframe != b->curframe)
+ return false;
+
+ for (fr = a->curframe; fr >= 0; fr--)
+ if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+ return false;
+
+ return true;
+}
+
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ * read and precision marks to do so, these marks are propagated from
+ * children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ * are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ * initial initial initial
+ * | | |
+ * V V V
+ * ... ... .---------> hdr
+ * | | | |
+ * V V | V
+ * cur .-> succ | .------...
+ * | | | | | |
+ * V | V | V V
+ * succ '-- cur | ... ...
+ * | | |
+ * | V V
+ * | succ <- cur
+ * | |
+ * | V
+ * | ...
+ * | |
+ * '----'
+ *
+ * (A) successor state of cur (B) successor state of cur or it's entry
+ * not yet traversed are in current DFS path, thus cur and succ
+ * are members of the same outermost loop
+ *
+ * initial initial
+ * | |
+ * V V
+ * ... ...
+ * | |
+ * V V
+ * .------... .------...
+ * | | | |
+ * V V V V
+ * .-> hdr ... ... ...
+ * | | | | |
+ * | V V V V
+ * | succ <- cur succ <- cur
+ * | | |
+ * | V V
+ * | ... ...
+ * | | |
+ * '----' exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ * does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ * traversed = set() # Set of traversed nodes
+ * entries = {} # Mapping from node to loop entry
+ * depths = {} # Depth level assigned to graph node
+ * path = set() # Current DFS path
+ *
+ * # Find outermost loop entry known for n
+ * def get_loop_entry(n):
+ * h = entries.get(n, None)
+ * while h in entries and entries[h] != h:
+ * h = entries[h]
+ * return h
+ *
+ * # Update n's loop entry if h's outermost entry comes
+ * # before n's outermost entry in current DFS path.
+ * def update_loop_entry(n, h):
+ * n1 = get_loop_entry(n) or n
+ * h1 = get_loop_entry(h) or h
+ * if h1 in path and depths[h1] <= depths[n1]:
+ * entries[n] = h1
+ *
+ * def dfs(n, depth):
+ * traversed.add(n)
+ * path.add(n)
+ * depths[n] = depth
+ * for succ in G.successors(n):
+ * if succ not in traversed:
+ * # Case A: explore succ and update cur's loop entry
+ * # only if succ's entry is in current DFS path.
+ * dfs(succ, depth + 1)
+ * h = get_loop_entry(succ)
+ * update_loop_entry(n, h)
+ * else:
+ * # Case B or C depending on `h1 in path` check in update_loop_entry().
+ * update_loop_entry(n, succ)
+ * path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branch == 0 as a signal that DFS of succ had been finished
+ * and cur's loop entry has to be updated (case A), handle this in
+ * update_branch_counts();
+ * - use st->branch > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+ while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+ topmost = topmost->loop_entry;
+ /* Update loop entries for intermediate states to avoid this
+ * traversal in future get_loop_entry() calls.
+ */
+ while (st && st->loop_entry != topmost) {
+ old = st->loop_entry;
+ st->loop_entry = topmost;
+ st = old;
+ }
+ return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+ struct bpf_verifier_state *cur1, *hdr1;
+
+ cur1 = get_loop_entry(cur) ?: cur;
+ hdr1 = get_loop_entry(hdr) ?: hdr;
+ /* The head1->branches check decides between cases B and C in
+ * comment for get_loop_entry(). If hdr1->branches == 0 then
+ * head's topmost loop entry is not in current DFS path,
+ * hence 'cur' and 'hdr' are not in the same loop and there is
+ * no need to update cur->loop_entry.
+ */
+ if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+ cur->loop_entry = hdr;
+ hdr->used_as_loop_entry = true;
+ }
+}
+
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
while (st) {
u32 br = --st->branches;
+ /* br == 0 signals that DFS exploration for 'st' is finished,
+ * thus it is necessary to update parent's loop entry if it
+ * turned out that st is a part of some loop.
+ * This is a part of 'case A' in get_loop_entry() comment.
+ */
+ if (br == 0 && st->parent && st->loop_entry)
+ update_loop_entry(st->parent, st->loop_entry);
+
/* WARN_ON(br > 1) technically makes sense here,
* but see comment in push_stack(), hence:
*/
@@ -944,7 +1686,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -982,17 +1724,9 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
-/* Mark the unknown part of a register (variable offset or scalar value) as
- * known to have the value @imm.
- */
-static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+/* This helper doesn't clear reg->id */
+static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
- memset(((u8 *)reg) + sizeof(reg->type), 0,
- offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
@@ -1005,6 +1739,19 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
reg->u32_max_value = (u32)imm;
}
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+{
+ /* Clear off and union(map_ptr, range) */
+ memset(((u8 *)reg) + sizeof(reg->type), 0,
+ offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+ ___mark_reg_known(reg, imm);
+}
+
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const_subreg(reg->var_off, imm);
@@ -1022,10 +1769,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
reg->type = SCALAR_VALUE;
+ /* all scalars are assumed imprecise initially (unless unprivileged,
+ * in which case everything is forced to be precise)
+ */
+ reg->precise = !env->bpf_capable;
}
static void mark_reg_known_zero(struct bpf_verifier_env *env,
@@ -1041,6 +1792,58 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
+static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+{
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
+ reg->type = CONST_PTR_TO_MAP;
+ reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
+ reg->map_uid = reg->id;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
+ } else {
+ reg->type = PTR_TO_MAP_VALUE;
+ }
+ return;
+ }
+
+ reg->type &= ~PTR_MAYBE_NULL;
+}
+
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
+}
+
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
@@ -1052,6 +1855,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -1133,69 +1942,214 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
/* Uses signed min/max values to inform unsigned, and vice-versa */
static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
- * are the same, so combine. This works even in the negative case, e.g.
- * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
+ * bits to improve our u32/s32 boundaries.
+ *
+ * E.g., the case where we have upper 32 bits as zero ([10, 20] in
+ * u64) is pretty trivial, it's obvious that in u32 we'll also have
+ * [10, 20] range. But this property holds for any 64-bit range as
+ * long as upper 32 bits in that entire range of values stay the same.
+ *
+ * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
+ * in decimal) has the same upper 32 bits throughout all the values in
+ * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
+ * range.
+ *
+ * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
+ * following the rules outlined below about u64/s64 correspondence
+ * (which equally applies to u32 vs s32 correspondence). In general it
+ * depends on actual hexadecimal values of 32-bit range. They can form
+ * only valid u32, or only valid s32 ranges in some cases.
+ *
+ * So we use all these insights to derive bounds for subregisters here.
*/
- if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- return;
+ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
+ /* u64 to u32 casting preserves validity of low 32 bits as
+ * a range, if upper 32 bits are the same
+ */
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
+
+ if ((s32)reg->umin_value <= (s32)reg->umax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ }
+ if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
+ /* low 32 bits should form a proper u32 range */
+ if ((u32)reg->smin_value <= (u32)reg->smax_value) {
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
+ }
+ /* low 32 bits should form a proper s32 range */
+ if ((s32)reg->smin_value <= (s32)reg->smax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+ /* Special case where upper bits form a small sequence of two
+ * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+ * 0x00000000 is also valid), while lower bits form a proper s32 range
+ * going from negative numbers to positive numbers. E.g., let's say we
+ * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+ * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+ * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
+ * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+ * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+ * upper 32 bits. As a random example, s64 range
+ * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+ * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
*/
- if ((s32)reg->u32_max_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value;
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- } else if ((s32)reg->u32_min_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value;
+ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+ (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+ (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ /* if u32 range forms a valid s32 range (due to matching sign bit),
+ * try to learn from that
+ */
+ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
+ reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
}
}
static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
+ /* If u64 range forms a valid s64 range (due to matching sign bit),
+ * try to learn from that. Let's do a bit of ASCII art to see when
+ * this is happening. Let's take u64 range first:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ *
+ * Valid u64 range is formed when umin and umax are anywhere in the
+ * range [0, U64_MAX], and umin <= umax. u64 case is simple and
+ * straightforward. Let's see how s64 range maps onto the same range
+ * of values, annotated below the line for comparison:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * So s64 values basically start in the middle and they are logically
+ * contiguous to the right of it, wrapping around from -1 to 0, and
+ * then finishing as S64_MAX (0x7fffffffffffffff) right before
+ * S64_MIN. We can try drawing the continuity of u64 vs s64 values
+ * more visually as mapped to sign-agnostic range of hex values.
+ *
+ * u64 start u64 end
+ * _______________________________________________________________
+ * / \
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ * / \
+ * >------------------------------ ------------------------------->
+ * s64 continues... s64 end s64 start s64 "midpoint"
+ *
+ * What this means is that, in general, we can't always derive
+ * something new about u64 from any random s64 range, and vice versa.
+ *
+ * But we can do that in two particular cases. One is when entire
+ * u64/s64 range is *entirely* contained within left half of the above
+ * diagram or when it is *entirely* contained in the right half. I.e.:
+ *
+ * |-------------------------------|--------------------------------|
+ * ^ ^ ^ ^
+ * A B C D
+ *
+ * [A, B] and [C, D] are contained entirely in their respective halves
+ * and form valid contiguous ranges as both u64 and s64 values. [A, B]
+ * will be non-negative both as u64 and s64 (and in fact it will be
+ * identical ranges no matter the signedness). [C, D] treated as s64
+ * will be a range of negative values, while in u64 it will be
+ * non-negative range of values larger than 0x8000000000000000.
+ *
+ * Now, any other range here can't be represented in both u64 and s64
+ * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
+ * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
+ * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
+ * for example. Similarly, valid s64 range [D, A] (going from negative
+ * to positive values), would be two separate [D, U64_MAX] and [0, A]
+ * ranges as u64. Currently reg_state can't represent two segments per
+ * numeric domain, so in such situations we can only derive maximal
+ * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+ *
+ * So we use these facts to derive umin/umax from smin/smax and vice
+ * versa only if they stay within the same "half". This is equivalent
+ * to checking sign bit: lower half will have sign bit as zero, upper
+ * half have sign bit 1. Below in code we simplify this by just
+ * casting umin/umax as smin/smax and checking if they form valid
+ * range, and vice versa. Those are equivalent checks.
+ */
+ if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+ reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+ reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
* are the same, so combine. This works even in the negative case, e.g.
* -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/
- if (reg->smin_value >= 0 || reg->smax_value < 0) {
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- return;
+ if ((u64)reg->smin_value <= (u64)reg->smax_value) {
+ reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
+ reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+}
+
+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+ /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+ * values on both sides of 64-bit range in hope to have tigher range.
+ * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+ * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+ * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+ * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+ * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+ * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
+ * We just need to make sure that derived bounds we are intersecting
+ * with are well-formed ranges in respecitve s64 or u64 domain, just
+ * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
- if ((s64)reg->umax_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->smin_value = reg->umin_value;
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- } else if ((s64)reg->umin_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value;
+ __u64 new_umin, new_umax;
+ __s64 new_smin, new_smax;
+
+ /* u32 -> u64 tightening, it's always well-formed */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+ /* if s32 can be treated as valid u32 range, we can use it as well */
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ /* s32 -> u64 tightening */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* s32 -> s64 tightening */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
}
}
@@ -1203,6 +2157,7 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
__reg32_deduce_bounds(reg);
__reg64_deduce_bounds(reg);
+ __reg_deduce_mixed_bounds(reg);
}
/* Attempts to improve var_off based on unsigned min/max information */
@@ -1211,94 +2166,101 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
-static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
-{
- reg->umin_value = reg->u32_min_value;
- reg->umax_value = reg->u32_max_value;
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds
- * but must be positive otherwise set to worse case bounds
- * and refine later from tnum.
- */
- if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
- reg->smax_value = reg->s32_max_value;
- else
- reg->smax_value = U32_MAX;
- if (reg->s32_min_value >= 0)
- reg->smin_value = reg->s32_min_value;
- else
- reg->smin_value = 0;
-}
-
-static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
+static void reg_bounds_sync(struct bpf_reg_state *reg)
{
- /* special case when 64-bit register has upper 32-bit register
- * zeroed. Typically happens after zext or <<32, >>32 sequence
- * allowing us to use 32-bit bounds directly,
- */
- if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
- __reg_assign_32_into_64(reg);
- } else {
- /* Otherwise the best we can do is push lower 32bit known and
- * unknown bits into register (var_off set from jmp logic)
- * then learn as much as possible from the 64-bit tnum
- * known and unknown bits. The previous smin/smax bounds are
- * invalid here because of jmp32 compare so mark them unknown
- * so they do not impact tnum bounds calculation.
- */
- __mark_reg64_unbounded(reg);
- __update_reg_bounds(reg);
- }
-
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
/* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.
*/
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
__update_reg_bounds(reg);
}
-static bool __reg64_bound_s32(s64 a)
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, const char *ctx)
{
- if (a > S32_MIN && a < S32_MAX)
- return true;
- return false;
+ const char *msg;
+
+ if (reg->umin_value > reg->umax_value ||
+ reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value) {
+ msg = "range bounds violation";
+ goto out;
+ }
+
+ if (tnum_is_const(reg->var_off)) {
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ if (tnum_subreg_is_const(reg->var_off)) {
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
+ "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
+ ctx, msg, reg->umin_value, reg->umax_value,
+ reg->smin_value, reg->smax_value,
+ reg->u32_min_value, reg->u32_max_value,
+ reg->s32_min_value, reg->s32_max_value,
+ reg->var_off.value, reg->var_off.mask);
+ if (env->test_reg_invariants)
+ return -EFAULT;
+ __mark_reg_unbounded(reg);
+ return 0;
}
-static bool __reg64_bound_u32(u64 a)
+static bool __reg32_bound_s64(s32 a)
{
- if (a > U32_MIN && a < U32_MAX)
- return true;
- return false;
+ return a >= 0 && a <= S32_MAX;
}
-static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
+static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
- __mark_reg32_unbounded(reg);
-
- if (__reg64_bound_s32(reg->smin_value))
- reg->s32_min_value = (s32)reg->smin_value;
- if (__reg64_bound_s32(reg->smax_value))
- reg->s32_max_value = (s32)reg->smax_value;
- if (__reg64_bound_u32(reg->umin_value))
- reg->u32_min_value = (u32)reg->umin_value;
- if (__reg64_bound_u32(reg->umax_value))
- reg->u32_max_value = (u32)reg->umax_value;
+ reg->umin_value = reg->u32_min_value;
+ reg->umax_value = reg->u32_max_value;
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
*/
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ } else {
+ reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -1306,14 +2268,16 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
/*
- * Clear type, id, off, and union(map_ptr, range) and
+ * Clear type, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1350,6 +2314,22 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(env, regs + regno);
}
+static void mark_btf_ld_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno,
+ enum bpf_reg_type reg_type,
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
+{
+ if (reg_type == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, regno);
+ return;
+ }
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_BTF_ID | flag;
+ regs[regno].btf = btf;
+ regs[regno].btf_id = btf_id;
+}
+
#define DEF_NOT_SUBREG (0)
static void init_reg_state(struct bpf_verifier_env *env,
struct bpf_func_state *state)
@@ -1370,6 +2350,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].frameno = state->frameno;
}
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+ return (struct bpf_retval_range){ minval, maxval };
+}
+
#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
@@ -1378,9 +2363,59 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = retval_range(0, 0);
init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
+}
+
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog)
+{
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.end_pos;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ goto err;
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ if (!frame)
+ goto err;
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
+err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ /* pop all elements and return */
+ while (!pop_stack(env, NULL, NULL, false));
+ return NULL;
}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -1416,45 +2451,467 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
}
ret = find_subprog(env, off);
if (ret >= 0)
- return 0;
+ return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
verbose(env, "too many subprograms\n");
return -E2BIG;
}
+ /* determine subprog starts. The end is one before the next starts */
env->subprog_info[env->subprog_cnt++].start = off;
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
+ return env->subprog_cnt - 1;
+}
+
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+ return ret;
+ }
+ id = ret;
+ t = btf_type_by_id(btf, id);
+ if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+ verbose(env, "exception callback '%s' must have global linkage\n", name);
+ return -EINVAL;
+ }
+ ret = 0;
+ for (i = 0; i < aux->func_info_cnt; i++) {
+ if (aux->func_info[i].type_id != id)
+ continue;
+ ret = aux->func_info[i].insn_off;
+ /* Further func_info and subprog checks will also happen
+ * later, so assume this is the right insn_off for now.
+ */
+ if (!ret) {
+ verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+ ret = -EINVAL;
+ }
+ }
+ if (!ret) {
+ verbose(env, "exception callback type id not found in func_info\n");
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+#define MAX_KFUNC_DESCS 256
+#define MAX_KFUNC_BTFS 256
+
+struct bpf_kfunc_desc {
+ struct btf_func_model func_model;
+ u32 func_id;
+ s32 imm;
+ u16 offset;
+ unsigned long addr;
+};
+
+struct bpf_kfunc_btf {
+ struct btf *btf;
+ struct module *module;
+ u16 offset;
+};
+
+struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
+ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+ u32 nr_descs;
+};
+
+struct bpf_kfunc_btf_tab {
+ struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ /* func_id is not greater than BTF_MAX_TYPE */
+ return d0->func_id - d1->func_id ?: d0->offset - d1->offset;
+}
+
+static int kfunc_btf_cmp_by_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_btf *d0 = a;
+ const struct bpf_kfunc_btf *d1 = b;
+
+ return d0->offset - d1->offset;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
+{
+ struct bpf_kfunc_desc desc = {
+ .func_id = func_id,
+ .offset = offset,
+ };
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ return bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
+}
+
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
return 0;
}
-static int check_subprogs(struct bpf_verifier_env *env)
+static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
+ s16 offset)
+{
+ struct bpf_kfunc_btf kf_btf = { .offset = offset };
+ struct bpf_kfunc_btf_tab *tab;
+ struct bpf_kfunc_btf *b;
+ struct module *mod;
+ struct btf *btf;
+ int btf_fd;
+
+ tab = env->prog->aux->kfunc_btf_tab;
+ b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
+ if (!b) {
+ if (tab->nr_descs == MAX_KFUNC_BTFS) {
+ verbose(env, "too many different module BTFs\n");
+ return ERR_PTR(-E2BIG);
+ }
+
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
+ return ERR_PTR(-EPROTO);
+ }
+
+ if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
+ offset * sizeof(btf_fd),
+ sizeof(btf_fd)))
+ return ERR_PTR(-EFAULT);
+
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF fd specified\n");
+ return btf;
+ }
+
+ if (!btf_is_module(btf)) {
+ verbose(env, "BTF fd for kfunc is not a module BTF\n");
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ if (!mod) {
+ btf_put(btf);
+ return ERR_PTR(-ENXIO);
+ }
+
+ b = &tab->descs[tab->nr_descs++];
+ b->btf = btf;
+ b->module = mod;
+ b->offset = offset;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_btf_cmp_by_off, NULL);
+ }
+ return b->btf;
+}
+
+void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
+{
+ if (!tab)
+ return;
+
+ while (tab->nr_descs--) {
+ module_put(tab->descs[tab->nr_descs].module);
+ btf_put(tab->descs[tab->nr_descs].btf);
+ }
+ kfree(tab);
+}
+
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
+{
+ if (offset) {
+ if (offset < 0) {
+ /* In the future, this can be allowed to increase limit
+ * of fd index into fd_array, interpreted as u16.
+ */
+ verbose(env, "negative offset disallowed for kernel module function call\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return __find_kfunc_desc_btf(env, offset);
+ }
+ return btf_vmlinux ?: ERR_PTR(-ENOENT);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+{
+ const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_btf_tab *btf_tab;
+ struct bpf_kfunc_desc_tab *tab;
+ struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_desc *desc;
+ const char *func_name;
+ struct btf *desc_btf;
+ unsigned long call_imm;
+ unsigned long addr;
+ int err;
+
+ prog_aux = env->prog->aux;
+ tab = prog_aux->kfunc_tab;
+ btf_tab = prog_aux->kfunc_btf_tab;
+ if (!tab) {
+ if (!btf_vmlinux) {
+ verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required for calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!bpf_jit_supports_kfunc_call()) {
+ verbose(env, "JIT does not support calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->gpl_compatible) {
+ verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+ return -EINVAL;
+ }
+
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ prog_aux->kfunc_tab = tab;
+ }
+
+ /* func_id == 0 is always invalid, but instead of returning an error, be
+ * conservative and wait until the code elimination pass before returning
+ * error, so that invalid calls that get pruned out can be in BPF programs
+ * loaded from userspace. It is also required that offset be untouched
+ * for such calls.
+ */
+ if (!func_id && !offset)
+ return 0;
+
+ if (!btf_tab && offset) {
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
+ if (!btf_tab)
+ return -ENOMEM;
+ prog_aux->kfunc_btf_tab = btf_tab;
+ }
+
+ desc_btf = find_kfunc_desc_btf(env, offset);
+ if (IS_ERR(desc_btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(desc_btf);
+ }
+
+ if (find_kfunc_desc(env->prog, func_id, offset))
+ return 0;
+
+ if (tab->nr_descs == MAX_KFUNC_DESCS) {
+ verbose(env, "too many different kernel function calls\n");
+ return -E2BIG;
+ }
+
+ func = btf_type_by_id(desc_btf, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %u is not a function\n",
+ func_id);
+ return -EINVAL;
+ }
+ func_proto = btf_type_by_id(desc_btf, func->type);
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ addr = kallsyms_lookup_name(func_name);
+ if (!addr) {
+ verbose(env, "cannot find address for kernel function %s\n",
+ func_name);
+ return -EINVAL;
+ }
+ specialize_kfunc(env, func_id, offset, &addr);
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
+ }
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
+ desc = &tab->descs[tab->nr_descs++];
+ desc->func_id = func_id;
+ desc->imm = call_imm;
+ desc->offset = offset;
+ desc->addr = addr;
+ err = btf_distill_func_proto(&env->log, desc_btf,
+ func_proto, func_name,
+ &desc->func_model);
+ if (!err)
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id_off, NULL);
+ return err;
+}
+
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
+ return 0;
+}
+
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
+{
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ if (!tab)
+ return;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm_off, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ .offset = insn->off,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
- int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
+ int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
- if (ret < 0)
+ if (ret)
return ret;
- /* determine subprog starts. The end is one before the next starts */
- for (i = 0; i < insn_cnt; i++) {
- if (insn[i].code != (BPF_JMP | BPF_CALL))
- continue;
- if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+ !bpf_pseudo_kfunc_call(insn))
continue;
+
if (!env->bpf_capable) {
- verbose(env,
- "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
- ret = add_subprog(env, i + insn[i].imm + 1);
+
+ if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
+ ret = add_subprog(env, i + insn->imm + 1);
+ else
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+
if (ret < 0)
return ret;
}
+ ret = bpf_find_exception_callback_insn_off(env);
+ if (ret < 0)
+ return ret;
+ ex_cb_insn = ret;
+
+ /* If ex_cb_insn > 0, this means that the main program has a subprog
+ * marked using BTF decl tag to serve as the exception callback.
+ */
+ if (ex_cb_insn) {
+ ret = add_subprog(env, ex_cb_insn);
+ if (ret < 0)
+ return ret;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start != ex_cb_insn)
+ continue;
+ env->exception_callback_subprog = i;
+ mark_subprog_exc_cb(env, i);
+ break;
+ }
+ }
+
/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
@@ -1464,17 +2921,37 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
+ if (code == (BPF_JMP | BPF_CALL) &&
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call)
+ subprog[cur_subprog].has_tail_call = true;
+ if (BPF_CLASS(code) == BPF_LD &&
+ (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
+ subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- off = i + insn[i].off + 1;
+ if (code == (BPF_JMP32 | BPF_JA))
+ off = i + insn[i].imm + 1;
+ else
+ off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -1483,9 +2960,10 @@ next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
- * or unconditional jump back
+ * or unconditional jump back or bpf_throw call
*/
if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP32 | BPF_JA) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
@@ -1515,7 +2993,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
break;
if (parent->live & REG_LIVE_DONE) {
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str[parent->type],
+ reg_type_str(env, parent->type),
parent->var_off.value, parent->off);
return -EFAULT;
}
@@ -1550,6 +3028,51 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ret;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
+ if (ret)
+ return ret;
+ return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
+ state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+}
+
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -1586,8 +3109,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
+ return false;
+
if (class == BPF_ALU64 || class == BPF_JMP ||
- /* BPF_END always use BPF_ALU class. */
(class == BPF_ALU && op == BPF_END && insn->imm == 64))
return true;
@@ -1596,13 +3121,17 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (class == BPF_LDX) {
if (t != SRC_OP)
- return BPF_SIZE(code) == BPF_DW;
+ return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
/* LDX source must be ptr. */
return true;
}
if (class == BPF_STX) {
- if (reg->type != SCALAR_VALUE)
+ /* BPF_STX (including atomic variants) has multiple source
+ * operands, one of which is a ptr. Check whether the caller is
+ * asking about it.
+ */
+ if (t == SRC_OP && reg->type != SCALAR_VALUE)
return true;
return BPF_SIZE(code) == BPF_DW;
}
@@ -1634,22 +3163,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
return true;
}
-/* Return TRUE if INSN doesn't have explicit value define. */
-static bool insn_no_def(struct bpf_insn *insn)
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
{
- u8 class = BPF_CLASS(insn->code);
-
- return (class == BPF_JMP || class == BPF_JMP32 ||
- class == BPF_STX || class == BPF_ST);
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH)) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else
+ return insn->src_reg;
+ } else {
+ return -1;
+ }
+ default:
+ return insn->dst_reg;
+ }
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- if (insn_no_def(insn))
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+ return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -1665,13 +3210,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
reg->subreg_def = DEF_NOT_SUBREG;
}
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *reg;
bool rw64;
if (regno >= MAX_BPF_REG) {
@@ -1679,6 +3222,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
reg = &regs[regno];
rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
@@ -1710,32 +3255,111 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
+}
+
+static int insn_stack_access_flags(int frameno, int spi)
+{
+ return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
- struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags)
{
u32 cnt = cur->jmp_history_cnt;
- struct bpf_idx_pair *p;
+ struct bpf_jmp_history_entry *p;
+ size_t alloc_size;
+
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
+ return 0;
+ }
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
- p[cnt - 1].idx = env->insn_idx;
- p[cnt - 1].prev_idx = env->prev_insn_idx;
cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
return 0;
}
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
*/
static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
u32 *history)
{
u32 cnt = *history;
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
if (cnt && st->jmp_history[cnt - 1].idx == i) {
i = st->jmp_history[cnt - 1].prev_idx;
(*history)--;
@@ -1745,14 +3369,183 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
return i;
}
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+ const struct btf_type *func;
+ struct btf *desc_btf;
+
+ if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+ return NULL;
+
+ desc_btf = find_kfunc_desc_btf(data, insn->off);
+ if (IS_ERR(desc_btf))
+ return "<error>";
+
+ func = btf_type_by_id(desc_btf, insn->imm);
+ return btf_name_by_offset(desc_btf, func->name_off);
+}
+
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verbose(bt->env, "BUG subprog exit from frame 0\n");
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ return bt->stack_masks[frame] & (1ull << slot);
+}
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
*/
-static int backtrack_insn(struct bpf_verifier_env *env, int idx,
- u32 *reg_mask, u64 *stack_mask)
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -1760,29 +3553,38 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
u8 mode = BPF_MODE(insn->code);
- u32 dreg = 1u << insn->dst_reg;
- u32 sreg = 1u << insn->src_reg;
- u32 spi;
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i, fr;
if (insn->code == 0)
return 0;
- if (env->log.level & BPF_LOG_LEVEL) {
- verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (class == BPF_ALU || class == BPF_ALU64) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused
+ * dreg still need precision before this insn
+ */
return 0;
- if (opcode == BPF_MOV) {
+ } else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
* dreg needs precision after this insn
* sreg needs precision before this insn
*/
- *reg_mask &= ~dreg;
- *reg_mask |= sreg;
+ bt_clear_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -1790,7 +3592,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* as precise=true in this verifier state.
* No further markings in parent are necessary
*/
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
}
} else {
if (BPF_SRC(insn->code) == BPF_X) {
@@ -1798,15 +3600,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* both dreg and sreg need precision
* before this insn
*/
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
}
} else if (class == BPF_LDX) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
@@ -1814,66 +3616,196 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* by 'precise' mark in corresponding register of this state.
* No further tracking necessary.
*/
- if (insn->src_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
-
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
* tracked with precision
*/
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- *stack_mask |= 1ull << spi;
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bt_set_frame_slot(bt, fr, spi);
} else if (class == BPF_STX || class == BPF_ST) {
- if (*reg_mask & dreg)
+ if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
*/
return -ENOTSUPP;
/* scalars can only be spilled into stack */
- if (insn->dst_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
return 0;
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- if (!(*stack_mask & (1ull << spi)))
- return 0;
- *stack_mask &= ~(1ull << spi);
+ bt_clear_frame_slot(bt, fr, spi);
if (class == BPF_STX)
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
- if (opcode == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ return -EFAULT;
+ }
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ return -EFAULT;
+ }
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
return -ENOTSUPP;
/* regular helper call sets R0 */
- *reg_mask &= ~1;
- if (*reg_mask & 0x3f) {
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
/* if backtracing was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", *reg_mask);
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
- return -ENOTSUPP;
+ bool r0_precise;
+
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ bt_set_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
+ /* else dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* It's ld_imm64 or ld_abs or ld_ind.
* For ld_imm64 no further tracking of precision
* into parent is necessary
@@ -1944,110 +3876,370 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
int i, j;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (func->stack[j].slot_type[0] != STACK_SPILL)
+ if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
}
}
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
- int spi)
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+static bool idset_contains(struct bpf_idset *s, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < s->count; ++i)
+ if (s->ids[i] == id)
+ return true;
+
+ return false;
+}
+
+static int idset_push(struct bpf_idset *s, u32 id)
+{
+ if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
+ return -EFAULT;
+ s->ids[s->count++] = id;
+ return 0;
+}
+
+static void idset_reset(struct bpf_idset *s)
+{
+ s->count = 0;
+}
+
+/* Collect a set of IDs for all registers currently marked as precise in env->bt.
+ * Mark all registers with these IDs as precise.
+ */
+static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_idset *precise_ids = &env->idset_scratch;
+ struct backtrack_state *bt = &env->bt;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ DECLARE_BITMAP(mask, 64);
+ int i, fr;
+
+ idset_reset(precise_ids);
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (!reg->id || reg->type != SCALAR_VALUE)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE)
+ break;
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+ }
+
+ for (fr = 0; fr <= st->curframe; ++fr) {
+ func = st->frame[fr];
+
+ for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
+ reg = &func->regs[i];
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_reg(bt, fr, i);
+ }
+ for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_slot(bt, fr, i);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ struct backtrack_state *bt = &env->bt;
struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
int last_idx = env->insn_idx;
+ int subseq_idx = -1;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
- u32 reg_mask = regno >= 0 ? 1u << regno : 0;
- u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
bool skip_first = true;
- bool new_marks = false;
- int i, err;
+ int i, fr, err;
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, env->cur_state->curframe);
+
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[bt->frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
- }
-
- while (spi >= 0) {
- if (func->stack[spi].slot_type[0] != STACK_SPILL) {
- stack_mask = 0;
- break;
- }
- reg = &func->stack[spi].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask = 0;
- break;
- }
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
- break;
+ bt_set_reg(bt, regno);
}
- if (!new_marks)
- return 0;
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ /* If some register with scalar ID is marked as precise,
+ * make sure that all registers sharing this ID are also precise.
+ * This is needed to estimate effect of find_equal_scalars().
+ * Do this at the last instruction of each state,
+ * bpf_reg_state::id fields are valid for these instructions.
+ *
+ * Allows to track precision in situation like below:
+ *
+ * r2 = unknown value
+ * ...
+ * --- state #0 ---
+ * ...
+ * r1 = r2 // r1 and r2 now share the same ID
+ * ...
+ * --- state #1 {r1.id = A, r2.id = A} ---
+ * ...
+ * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
+ * ...
+ * --- state #2 {r1.id = A, r2.id = A} ---
+ * r3 = r10
+ * r3 += r1 // need to mark both r1 and r2
+ */
+ if (mark_precise_scalar_ids(env, st))
+ return -EFAULT;
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE)
+ reg->precise = true;
+ }
+ return 0;
+ }
+
+ verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, st);
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
return 0;
} else if (err) {
return err;
}
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
/* Found assignment(s) into tracked register in this state.
* Since this state is already marked, just return.
* Nothing to be tracked further in the parent state.
*/
return 0;
- if (i == first_idx)
- break;
+ subseq_idx = i;
i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
* and there are still reg_mask or stack_mask
@@ -2064,86 +4256,89 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!st)
break;
- new_marks = false;
- func = st->frame[st->curframe];
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
- continue;
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise)
+ bt_clear_frame_reg(bt, fr, i);
+ else
+ reg->precise = true;
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, st);
- return 0;
- }
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
+ verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
+ i, func->allocated_stack / BPF_REG_SIZE);
+ WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ return -EFAULT;
+ }
- if (func->stack[i].slot_type[0] != STACK_SPILL) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (!is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise)
+ bt_clear_frame_slot(bt, fr, i);
+ else
+ reg->precise = true;
}
- reg = &func->stack[i].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, func, true);
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- if (env->log.level & BPF_LOG_LEVEL) {
- print_verifier_state(env, func);
- verbose(env, "parent %s regs=%x stack=%llx marks\n",
- new_marks ? "didn't have" : "already had",
- reg_mask, stack_mask);
}
- if (!reg_mask && !stack_mask)
- break;
- if (!new_marks)
- break;
+ if (bt_empty(bt))
+ return 0;
+ subseq_idx = first_idx;
last_idx = st->last_insn_idx;
first_idx = st->first_insn_idx;
}
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
+ }
+
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, regno);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, -1);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_MAP_VALUE:
- case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -2152,14 +4347,14 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
+ case PTR_TO_BUF:
+ case PTR_TO_MEM:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
return true;
default:
return false;
@@ -2172,9 +4367,31 @@ static bool register_is_null(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
-static bool register_is_const(struct bpf_reg_state *reg)
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
{
- return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
+ return reg->type == SCALAR_VALUE &&
+ tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
+}
+
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
+{
+ return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
+}
+
+static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
+{
+ return tnum_is_unknown(reg->var_off) &&
+ reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
+ reg->umin_value == 0 && reg->umax_value == U64_MAX &&
+ reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
+ reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
+}
+
+static bool register_is_bounded(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -2186,39 +4403,61 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
-static void save_register_state(struct bpf_func_state *state,
- int spi, struct bpf_reg_state *reg)
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
+static void save_register_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
+ int spi, struct bpf_reg_state *reg,
+ int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
+ if (size == BPF_REG_SIZE)
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_SPILL;
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
+
+ /* size < 8 bytes spill */
+ for (; i; i--)
+ mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
}
-/* check_stack_read/write functions track spill/fill of registers,
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_env *env,
- struct bpf_func_state *state, /* func where register points to */
- int off, int size, int value_regno, int insn_idx)
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+ /* stack frame we're writing to */
+ struct bpf_func_state *state,
+ int off, int size, int value_regno,
+ int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
+ int insn_flags = insn_stack_access_flags(state->frameno, spi);
- err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
- state->acquired_refs, true);
- if (err)
- return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- state->stack[spi].slot_type[0] == STACK_SPILL &&
+ is_spilled_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -2227,21 +4466,39 @@ static int check_stack_write(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
- if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
- !register_is_null(reg) && env->bpf_capable) {
- if (dst_reg != BPF_REG_FP) {
- /* The backtracking logic can only recognize explicit
- * stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conervative.
- * Backtrack from here and mark all registers as precise
- * that contributed into 'reg' being a constant.
- */
- err = mark_chain_precision(env, value_regno);
- if (err)
- return err;
+ for (i = 0; i < size; i++) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
+ sanitize = true;
+ break;
+ }
}
- save_register_state(state, spi, reg);
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
+
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi);
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
+ save_register_state(env, state, spi, reg, size);
+ /* Break the relation on a narrowing spill. */
+ if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
+ state->stack[spi].spilled_ptr.id = 0;
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ insn->imm != 0 && env->bpf_capable) {
+ struct bpf_reg_state fake_reg = {};
+
+ __mark_reg_known(&fake_reg, insn->imm);
+ fake_reg.type = SCALAR_VALUE;
+ save_register_state(env, state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -2249,57 +4506,20 @@ static int check_stack_write(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
- save_register_state(state, spi, reg);
+ save_register_state(env, state, spi, reg, size);
} else {
u8 type = STACK_MISC;
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (state->stack[spi].slot_type[0] == STACK_SPILL)
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_MISC;
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -2313,8 +4533,14 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (reg && register_is_null(reg)) {
- /* backtracking doesn't work for STACK_ZERO yet. */
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
+ /* STACK_ZERO case happened because register spill
+ * wasn't properly aligned at the stack slot boundary,
+ * so it's not a register spill anymore; force
+ * originating register to be precise to make
+ * STACK_ZERO correct for subsequent states
+ */
err = mark_chain_precision(env, value_regno);
if (err)
return err;
@@ -2323,61 +4549,267 @@ static int check_stack_write(struct bpf_verifier_env *env,
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
- state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
- type;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+ insn_flags = 0; /* not a register spill */
}
+
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
return 0;
}
-static int check_stack_read(struct bpf_verifier_env *env,
- struct bpf_func_state *reg_state /* func where register points to */,
- int off, int size, int value_regno)
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+ * known to contain a variable offset.
+ * This function checks whether the write is permitted and conservatively
+ * tracks the effects of the write, considering that each stack slot in the
+ * dynamic range is potentially written to.
+ *
+ * 'off' includes 'regno->off'.
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
+ * the stack.
+ *
+ * Spilled pointers in range are not marked as written because we don't know
+ * what's going to be actually written. This means that read propagation for
+ * future reads cannot be terminated by this write.
+ *
+ * For privileged programs, uninitialized stack slots are considered
+ * initialized by this write (even though we don't know exactly what offsets
+ * are going to be written to). The idea is that we don't want the verifier to
+ * reject future reads that access slots written to through variable offsets.
+ */
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
+ /* func where register points to */
+ struct bpf_func_state *state,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
+{
+ struct bpf_func_state *cur; /* state of the current function */
+ int min_off, max_off;
+ int i, err;
+ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ bool writing_zero = false;
+ /* set if the fact that we're writing a zero is used to let any
+ * stack slots remain STACK_ZERO
+ */
+ bool zero_used = false;
+
+ cur = env->cur_state->frame[env->cur_state->curframe];
+ ptr_reg = &cur->regs[ptr_regno];
+ min_off = ptr_reg->smin_value + off;
+ max_off = ptr_reg->smax_value + off + size;
+ if (value_regno >= 0)
+ value_reg = &cur->regs[value_regno];
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
+ writing_zero = true;
+
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
+
+ /* Variable offset writes destroy any spilled pointers in range. */
+ for (i = min_off; i < max_off; i++) {
+ u8 new_type, *stype;
+ int slot, spi;
+
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ mark_stack_slot_scratched(env, spi);
+
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
+ */
+ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+ insn_idx, i);
+ return -EINVAL;
+ }
+
+ /* Erase all spilled pointers. */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+
+ /* Update the slot type. */
+ new_type = STACK_MISC;
+ if (writing_zero && *stype == STACK_ZERO) {
+ new_type = STACK_ZERO;
+ zero_used = true;
+ }
+ /* If the slot is STACK_INVALID, we check whether it's OK to
+ * pretend that it will be initialized by this write. The slot
+ * might not actually be written to, and so if we mark it as
+ * initialized future reads might leak uninitialized memory.
+ * For privileged programs, we will accept such reads to slots
+ * that may or may not be written because, if we're reject
+ * them, the error would be too confusing.
+ */
+ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+ insn_idx, i);
+ return -EINVAL;
+ }
+ *stype = new_type;
+ }
+ if (zero_used) {
+ /* backtracking doesn't work for STACK_ZERO yet. */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/* When register 'dst_regno' is assigned some values from stack[min_off,
+ * max_off), we set the register's type according to the types of the
+ * respective stack slots. If all the stack values are known to be zeros, then
+ * so is the destination reg. Otherwise, the register is considered to be
+ * SCALAR. This function does not deal with register filling; the caller must
+ * ensure that all spilled registers in the stack range have been marked as
+ * read.
+ */
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *ptr_state,
+ int min_off, int max_off, int dst_regno)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
- struct bpf_reg_state *reg;
+ int i, slot, spi;
u8 *stype;
+ int zeros = 0;
- if (reg_state->allocated_stack <= slot) {
- verbose(env, "invalid read from stack off %d+0 size %d\n",
- off, size);
- return -EACCES;
+ for (i = min_off; i < max_off; i++) {
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ mark_stack_slot_scratched(env, spi);
+ stype = ptr_state->stack[spi].slot_type;
+ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+ break;
+ zeros++;
+ }
+ if (zeros == max_off - min_off) {
+ /* Any access_size read into register is zero extended,
+ * so the whole register == const_zero.
+ */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ } else {
+ /* have read misc data from the stack */
+ mark_reg_unknown(env, state->regs, dst_regno);
}
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+}
+
+/* Read the stack at 'off' and put the results into the register indicated by
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+ * spilled reg.
+ *
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
+ * register.
+ *
+ * The access is assumed to be within the current stack bounds.
+ */
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *reg_state,
+ int off, int size, int dst_regno)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ struct bpf_reg_state *reg;
+ u8 *stype, type;
+ int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
- if (stype[0] == STACK_SPILL) {
- if (size != BPF_REG_SIZE) {
+ mark_stack_slot_scratched(env, spi);
+
+ if (is_spilled_reg(&reg_state->stack[spi])) {
+ u8 spill_size = 1;
+
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
+ spill_size++;
+
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
if (reg->type != SCALAR_VALUE) {
verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "invalid size of register fill\n");
return -EACCES;
}
- if (value_regno >= 0) {
- mark_reg_unknown(env, state->regs, value_regno);
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
- }
+
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- return 0;
- }
- for (i = 1; i < BPF_REG_SIZE; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
- verbose(env, "corrupted spill memory\n");
- return -EACCES;
- }
- }
+ if (dst_regno < 0)
+ return 0;
+
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ /* The earlier check_reg_arg() has decided the
+ * subreg_def for this insn. Save it first.
+ */
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
+
+ copy_register_state(&state->regs[dst_regno], reg);
+ state->regs[dst_regno].subreg_def = subreg_def;
+ } else {
+ int spill_cnt = 0, zero_cnt = 0;
+
+ for (i = 0; i < size; i++) {
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_SPILL) {
+ spill_cnt++;
+ continue;
+ }
+ if (type == STACK_MISC)
+ continue;
+ if (type == STACK_ZERO) {
+ zero_cnt++;
+ continue;
+ }
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
- if (value_regno >= 0) {
+ if (spill_cnt == size &&
+ tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ /* this IS register fill, so keep insn_flags */
+ } else if (zero_cnt == size) {
+ /* similarly to mark_reg_stack_read(), preserve zeroes */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ insn_flags = 0; /* not restoring original register state */
+ } else {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ insn_flags = 0; /* not restoring original register state */
+ }
+ }
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+ } else if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
- /* If value_regno==-1, the caller is asking us whether
+ /* If dst_regno==-1, the caller is asking us whether
* it is acceptable to use this value as a SCALAR_VALUE
* (e.g. for XADD).
* We must not allow unprivileged callers to do that
@@ -2389,70 +4821,166 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
- int zeros = 0;
-
for (i = 0; i < size; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_MISC)
continue;
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
- zeros++;
+ if (type == STACK_ZERO)
+ continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
- }
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- if (value_regno >= 0) {
- if (zeros == size) {
- /* any size read into register is zero extended,
- * so the whole register == const_zero
- */
- __mark_reg_const_zero(&state->regs[value_regno]);
- /* backtracking doesn't support STACK_ZERO yet,
- * so mark it precise here, so that later
- * backtracking can stop here.
- * Backtracking may not need this if this register
- * doesn't participate in pointer adjustment.
- * Forward propagation of precise flag is not
- * necessary either. This mark is only to stop
- * backtracking. Any register that contributed
- * to const 0 was marked precise before spill.
- */
- state->regs[value_regno].precise = true;
- } else {
- /* have read misc data from the stack */
- mark_reg_unknown(env, state->regs, value_regno);
- }
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
- }
+ if (dst_regno >= 0)
+ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ insn_flags = 0; /* we are not restoring spilled register */
}
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
+ return 0;
+}
+
+enum bpf_access_src {
+ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
+ ACCESS_HELPER = 2, /* the access is performed by a helper */
+};
+
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ bool zero_size_allowed,
+ enum bpf_access_src type,
+ struct bpf_call_arg_meta *meta);
+
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+{
+ return cur_regs(env) + regno;
+}
+
+/* Read the stack at 'ptr_regno + off' and put the result into the register
+ * 'dst_regno'.
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * but not its variable offset.
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+ *
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+ * filling registers (i.e. reads of spilled register cannot be detected when
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
+ * instead.
+ */
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size, int dst_regno)
+{
+ /* The state of the source register. */
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *ptr_state = func(env, reg);
+ int err;
+ int min_off, max_off;
+
+ /* Note that we pass a NULL meta, so raw access will not be permitted.
+ */
+ err = check_stack_range_initialized(env, ptr_regno, off, size,
+ false, ACCESS_DIRECT, NULL);
+ if (err)
+ return err;
+
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
+ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
return 0;
}
-static int check_stack_access(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg,
- int off, int size)
+/* check_stack_read dispatches to check_stack_read_fixed_off or
+ * check_stack_read_var_off.
+ *
+ * The caller must ensure that the offset falls within the allocated stack
+ * bounds.
+ *
+ * 'dst_regno' is a register which will receive the value from the stack. It
+ * can be -1, meaning that the read value is not going to a register.
+ */
+static int check_stack_read(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int dst_regno)
{
- /* Stack accesses must be at a fixed offset, so that we
- * can determine what type of data were returned. See
- * check_stack_read().
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+ /* Some accesses are only permitted with a static offset. */
+ bool var_off = !tnum_is_const(reg->var_off);
+
+ /* The offset is required to be static when reads don't go to a
+ * register, in order to not leak pointers (see
+ * check_stack_read_fixed_off).
*/
- if (!tnum_is_const(reg->var_off)) {
+ if (dst_regno < 0 && var_off) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
tn_buf, off, size);
return -EACCES;
}
-
- if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose(env, "invalid stack off=%d size=%d\n", off, size);
- return -EACCES;
+ /* Variable offset is prohibited for unprivileged mode for simplicity
+ * since it requires corresponding support in Spectre masking for stack
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
+ */
+ if (!var_off) {
+ off += reg->var_off.value;
+ err = check_stack_read_fixed_off(env, state, off, size,
+ dst_regno);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones. Note that dst_regno >= 0 on this
+ * branch.
+ */
+ err = check_stack_read_var_off(env, ptr_regno, off, size,
+ dst_regno);
}
+ return err;
+}
- return 0;
+
+/* check_stack_write dispatches to check_stack_write_fixed_off or
+ * check_stack_write_var_off.
+ *
+ * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+ * 'value_regno' is the register whose value we're writing to the stack. It can
+ * be -1, meaning that we're not writing from a register.
+ *
+ * The caller must ensure that the offset falls within the maximum stack size.
+ */
+static int check_stack_write(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
+{
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+
+ if (tnum_is_const(reg->var_off)) {
+ off += reg->var_off.value;
+ err = check_stack_write_fixed_off(env, state, off, size,
+ value_regno, insn_idx);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones.
+ */
+ err = check_stack_write_var_off(env, state,
+ ptr_regno, off, size,
+ value_regno, insn_idx);
+ }
+ return err;
}
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@ -2490,6 +5018,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
reg = &cur_regs(env)[regno];
switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size);
@@ -2522,11 +5054,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
/* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
- */
- if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, state);
-
- /* The minimum value is only important with signed
+ *
+ * The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
@@ -2568,36 +5097,309 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
+{
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
+ */
+
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+ struct btf_field *kptr_field,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags;
+ const char *reg_name = "";
+
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ perm_flags |= MEM_PERCPU;
+ }
+
+ if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ goto bad_type;
+
+ /* We need to verify reg->type and reg->btf, before accessing reg->btf */
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
+
+ /* For ref_ptr case, release function check should ensure we get one
+ * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+ * normal store of unreferenced kptr, we must ensure var_off is zero.
+ * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+ * reg->off and reg->ref_obj_id are not needed here.
+ */
+ if (__check_ptr_off_reg(env, reg, regno, true))
+ return -EACCES;
+
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
+ * we also need to take into account the reg->off.
+ *
+ * We want to support cases like:
+ *
+ * struct foo {
+ * struct bar br;
+ * struct baz bz;
+ * };
+ *
+ * struct foo *v;
+ * v = func(); // PTR_TO_BTF_ID
+ * val->foo = v; // reg->off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * // first member type of struct after comparison fails
+ * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * // to match type
+ *
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * is zero. We must also ensure that btf_struct_ids_match does not walk
+ * the struct to match type against first member of struct, i.e. reject
+ * second case from above. Hence, when type is BPF_KPTR_REF, we set
+ * strict mode to true for type match.
+ */
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type != BPF_KPTR_UNREF))
+ goto bad_type;
+ return 0;
+bad_type:
+ verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+ reg_type_str(env, reg->type), reg_name);
+ verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+ targ_name);
+ else
+ verbose(env, "\n");
+ return -EINVAL;
+}
+
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_lock ||
+ env->cur_state->active_lock.ptr ||
+ !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+#endif
+#ifdef CONFIG_BPF_JIT
+BTF_ID(struct, bpf_cpumask)
+#endif
+BTF_ID(struct, task_struct)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return true;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
+{
+ struct btf_struct_meta *meta;
+
+ if (btf_is_kernel(kptr_field->kptr.btf))
+ return NULL;
+
+ meta = btf_find_struct_meta(kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id);
+
+ return meta ? meta->record : NULL;
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_PERCPU ||
+ (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+ struct btf_record *rec;
+ u32 ret;
+
+ ret = PTR_MAYBE_NULL;
+ if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+ ret |= MEM_RCU;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ ret |= MEM_PERCPU;
+ else if (!btf_is_kernel(kptr_field->kptr.btf))
+ ret |= MEM_ALLOC;
+
+ rec = kptr_pointee_btf_record(kptr_field);
+ if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
+ ret |= NON_OWN_REF;
+ } else {
+ ret |= PTR_UNTRUSTED;
+ }
+
+ return ret;
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+ int value_regno, int insn_idx,
+ struct btf_field *kptr_field)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int class = BPF_CLASS(insn->code);
+ struct bpf_reg_state *val_reg;
+
+ /* Things we already checked for in check_map_access and caller:
+ * - Reject cases where variable offset may touch kptr
+ * - size of access (must be BPF_DW)
+ * - tnum_is_const(reg->var_off)
+ * - kptr_field->offset == off + reg->var_off.value
+ */
+ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+ return -EACCES;
+ }
+
+ /* We only allow loading referenced kptr, since it will be marked as
+ * untrusted, similar to unreferenced kptr.
+ */
+ if (class != BPF_LDX &&
+ (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
+ verbose(env, "store to referenced kptr disallowed\n");
+ return -EACCES;
+ }
+
+ if (class == BPF_LDX) {
+ val_reg = reg_state(env, value_regno);
+ /* We can simply mark the value_regno receiving the pointer
+ * value from map as PTR_TO_BTF_ID, with the correct type.
+ */
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
+ /* For mark_ptr_or_null_reg */
+ val_reg->id = ++env->id_gen;
+ } else if (class == BPF_STX) {
+ val_reg = reg_state(env, value_regno);
+ if (!register_is_null(val_reg) &&
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
+ return -EACCES;
+ } else if (class == BPF_ST) {
+ if (insn->imm) {
+ verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+ kptr_field->offset);
+ return -EACCES;
+ }
+ } else {
+ verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+ int off, int size, bool zero_size_allowed,
+ enum bpf_access_src src)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ struct btf_record *rec;
+ int err, i;
err = check_mem_region_access(env, regno, off, size, map->value_size,
zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
+ if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ if (src != ACCESS_DIRECT) {
+ verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "kptr access cannot have variable offset\n");
+ return -EACCES;
+ }
+ if (p != off + reg->var_off.value) {
+ verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ p, off + reg->var_off.value);
+ return -EACCES;
+ }
+ if (size != bpf_size_to_bytes(BPF_DW)) {
+ verbose(env, "kptr access size must be BPF_DW\n");
+ return -EACCES;
+ }
+ break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
@@ -2606,7 +5408,9 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
{
- switch (env->prog->type) {
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
/* Program types only with direct read access go here! */
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
@@ -2616,7 +5420,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_CGROUP_SKB:
if (t == BPF_WRITE)
return false;
- /* fallthrough */
+ fallthrough;
/* Program types with direct read + write access go here! */
case BPF_PROG_TYPE_SCHED_CLS:
@@ -2662,7 +5466,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
regno);
return -EACCES;
}
- err = __check_mem_access(env, regno, off, size, reg->range,
+
+ err = reg->range < 0 ? -EINVAL :
+ __check_mem_access(env, regno, off, size, reg->range,
zero_size_allowed);
if (err) {
verbose(env, "R%d offset is outside of the packet\n", regno);
@@ -2685,7 +5491,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type,
- u32 *btf_id)
+ struct btf **btf, u32 *btf_id)
{
struct bpf_insn_access_aux info = {
.reg_type = *reg_type,
@@ -2703,10 +5509,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
+ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
+ *btf = info.btf;
*btf_id = info.btf_id;
- else
+ } else {
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ }
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -2769,16 +5577,11 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
}
verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str[reg->type], off, size);
+ regno, reg_type_str(env, reg->type), off, size);
return -EACCES;
}
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
-{
- return cur_regs(env) + regno;
-}
-
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@ -2813,6 +5616,48 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+ [CONST_PTR_TO_MAP] = btf_bpf_map_id,
+};
+
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* Types listed in the reg2btf_ids are always trusted */
+ if (reg2btf_ids[base_type(reg->type)])
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -2889,6 +5734,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -2897,8 +5745,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_STACK:
pointer_desc = "stack ";
- /* The stack spill tracking logic in check_stack_write()
- * and check_stack_read() relies on stack accesses being
+ /* The stack spill tracking logic in check_stack_write_fixed_off()
+ * and check_stack_read_fixed_off() relies on stack accesses being
* aligned.
*/
strict = true;
@@ -2922,35 +5770,49 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
-static int update_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_func_state *func,
- int off)
-{
- u16 stack = env->subprog_info[func->subprogno].stack_depth;
-
- if (stack >= -off)
- return 0;
-
- /* update known max for given subprogram */
- env->subprog_info[func->subprogno].stack_depth = -off;
- return 0;
-}
-
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
{
- int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end;
+ bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
+ int j;
+ i = subprog[idx].start;
process_func:
+ /* protect against potential stack overflow that might happen when
+ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
+ * depth for such case down to 256 so that the worst case scenario
+ * would result in 8k stack size (32 which is tailcall limit * 256 =
+ * 8k).
+ *
+ * To get the idea what might happen, see an example:
+ * func1 -> sub rsp, 128
+ * subfunc1 -> sub rsp, 256
+ * tailcall1 -> add rsp, 256
+ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
+ * subfunc2 -> sub rsp, 64
+ * subfunc22 -> sub rsp, 128
+ * tailcall2 -> add rsp, 128
+ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
+ *
+ * tailcall will unwind the current stack frame but it will not get rid
+ * of caller's stack as shown on the example above.
+ */
+ if (idx && subprog[idx].has_tail_call && depth >= 256) {
+ verbose(env,
+ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
+ depth);
+ return -EACCES;
+ }
/* round up to 32-bytes, since this is granularity
* of interpreter stack size
*/
@@ -2963,22 +5825,62 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (insn[i].code != (BPF_JMP | BPF_CALL))
- continue;
- if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ int next_insn, sidx;
+
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
+
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
ret_prog[frame] = idx;
/* find the callee */
- i = i + insn[i].imm + 1;
- idx = find_subprog(env, i);
- if (idx < 0) {
+ next_insn = i + insn[i].imm + 1;
+ sidx = find_subprog(env, next_insn);
+ if (sidx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i);
+ next_insn);
return -EFAULT;
}
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
+ verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size unless called directly */
+ if (!bpf_pseudo_call(insn + i))
+ continue;
+ if (subprog[sidx].is_exception_cb) {
+ verbose(env, "insn %d cannot call exception cb directly\n", i);
+ return -EINVAL;
+ }
+ }
+ i = next_insn;
+ idx = sidx;
+
+ if (subprog[idx].has_tail_call)
+ tail_call_reachable = true;
+
frame++;
if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
@@ -2987,6 +5889,22 @@ continue_func:
}
goto process_func;
}
+ /* if tail call got detected across bpf2bpf calls then mark each of the
+ * currently present subprog frames as tail call reachable subprogs;
+ * this info will be utilized by JIT so that we will be preserving the
+ * tail call counter throughout bpf2bpf calls combined with tailcalls
+ */
+ if (tail_call_reachable)
+ for (j = 0; j < frame; j++) {
+ if (subprog[ret_prog[j]].is_exception_cb) {
+ verbose(env, "cannot tail call within exception cb\n");
+ return -EINVAL;
+ }
+ subprog[ret_prog[j]].tail_call_reachable = true;
+ }
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
+
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
*/
@@ -2999,6 +5917,22 @@ continue_func:
goto continue_func;
}
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int ret;
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (!i || si[i].is_async_cb) {
+ ret = check_max_stack_depth_subprog(env, i);
+ if (ret < 0)
+ return ret;
+ }
+ continue;
+ }
+ return 0;
+}
+
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
static int get_callee_stack_depth(struct bpf_verifier_env *env,
const struct bpf_insn *insn, int idx)
@@ -3015,24 +5949,24 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+static int __check_buffer_access(struct bpf_verifier_env *env,
+ const char *buf_info,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
{
- /* Access to ctx or passing it to a helper is only allowed in
- * its original, unmodified form.
- */
-
- if (reg->off) {
- verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n",
- regno, reg->off);
+ if (off < 0) {
+ verbose(env,
+ "R%d invalid %s buffer access: off=%d, size=%d\n",
+ regno, buf_info, off, size);
return -EACCES;
}
-
if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf);
+ verbose(env,
+ "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
+ regno, off, tn_buf);
return -EACCES;
}
@@ -3043,27 +5977,37 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size)
{
- if (off < 0) {
- verbose(env,
- "R%d invalid tracepoint buffer access: off=%d, size=%d",
- regno, off, size);
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
+ int err;
+
+ err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+ if (err)
+ return err;
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env,
- "R%d invalid variable buffer offset: off=%d, var_off=%s",
- regno, off, tn_buf);
- return -EACCES;
- }
if (off + size > env->prog->aux->max_tp_access)
env->prog->aux->max_tp_access = off + size;
return 0;
}
+static int check_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size,
+ bool zero_size_allowed,
+ u32 *max_access)
+{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
+ int err;
+
+ err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+ if (err)
+ return err;
+
+ if (off + size > *max_access)
+ *max_access = off + size;
+
+ return 0;
+}
+
/* BPF architecture zero extends alu32 ops into 64-bit registesr */
static void zext_32_to_64(struct bpf_reg_state *reg)
{
@@ -3097,17 +6041,175 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
* values are also truncated so we push 64-bit bounds into
* 32-bit bounds. Above were truncated < 32-bits already.
*/
- if (size >= 4)
+ if (size < 4) {
+ __mark_reg32_unbounded(reg);
+ reg_bounds_sync(reg);
+ }
+}
+
+static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->smin_value = reg->s32_min_value = S8_MIN;
+ reg->smax_value = reg->s32_max_value = S8_MAX;
+ } else if (size == 2) {
+ reg->smin_value = reg->s32_min_value = S16_MIN;
+ reg->smax_value = reg->s32_max_value = S16_MAX;
+ } else {
+ /* size == 4 */
+ reg->smin_value = reg->s32_min_value = S32_MIN;
+ reg->smax_value = reg->s32_max_value = S32_MAX;
+ }
+ reg->umin_value = reg->u32_min_value = 0;
+ reg->umax_value = U64_MAX;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_unknown;
+}
+
+static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
+ u64 top_smax_value, top_smin_value;
+ u64 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u64_cval = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u64_cval);
+ else if (size == 2)
+ reg->var_off = tnum_const((s16)u64_cval);
+ else
+ /* size == 4 */
+ reg->var_off = tnum_const((s32)u64_cval);
+
+ u64_cval = reg->var_off.value;
+ reg->smax_value = reg->smin_value = u64_cval;
+ reg->umax_value = reg->umin_value = u64_cval;
+ reg->s32_max_value = reg->s32_min_value = u64_cval;
+ reg->u32_max_value = reg->u32_min_value = u64_cval;
return;
- __reg_combine_64_into_32(reg);
+ }
+
+ top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s64_min and s64_min after sign extension */
+ if (size == 1) {
+ init_s64_max = (s8)reg->smax_value;
+ init_s64_min = (s8)reg->smin_value;
+ } else if (size == 2) {
+ init_s64_max = (s16)reg->smax_value;
+ init_s64_min = (s16)reg->smin_value;
+ } else {
+ init_s64_max = (s32)reg->smax_value;
+ init_s64_min = (s32)reg->smin_value;
+ }
+
+ s64_max = max(init_s64_max, init_s64_min);
+ s64_min = min(init_s64_max, init_s64_min);
+
+ /* both of s64_max/s64_min positive or negative */
+ if ((s64_max >= 0) == (s64_min >= 0)) {
+ reg->smin_value = reg->s32_min_value = s64_min;
+ reg->smax_value = reg->s32_max_value = s64_max;
+ reg->umin_value = reg->u32_min_value = s64_min;
+ reg->umax_value = reg->u32_max_value = s64_max;
+ reg->var_off = tnum_range(s64_min, s64_max);
+ return;
+ }
+
+out:
+ set_sext64_default_val(reg, size);
+}
+
+static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->s32_min_value = S8_MIN;
+ reg->s32_max_value = S8_MAX;
+ } else {
+ /* size == 2 */
+ reg->s32_min_value = S16_MIN;
+ reg->s32_max_value = S16_MAX;
+ }
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+}
+
+static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
+ u32 top_smax_value, top_smin_value;
+ u32 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u32_val = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u32_val);
+ else
+ reg->var_off = tnum_const((s16)u32_val);
+
+ u32_val = reg->var_off.value;
+ reg->s32_min_value = reg->s32_max_value = u32_val;
+ reg->u32_min_value = reg->u32_max_value = u32_val;
+ return;
+ }
+
+ top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s32_min and s32_min after sign extension */
+ if (size == 1) {
+ init_s32_max = (s8)reg->s32_max_value;
+ init_s32_min = (s8)reg->s32_min_value;
+ } else {
+ /* size == 2 */
+ init_s32_max = (s16)reg->s32_max_value;
+ init_s32_min = (s16)reg->s32_min_value;
+ }
+ s32_max = max(init_s32_max, init_s32_min);
+ s32_min = min(init_s32_max, init_s32_min);
+
+ if ((s32_min >= 0) == (s32_max >= 0)) {
+ reg->s32_min_value = s32_min;
+ reg->s32_max_value = s32_max;
+ reg->u32_min_value = (u32)s32_min;
+ reg->u32_max_value = (u32)s32_max;
+ return;
+ }
+
+out:
+ set_sext32_default_val(reg, size);
}
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
- return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+ /* A map is considered read-only if the following condition are true:
+ *
+ * 1) BPF program side cannot change any of the map content. The
+ * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+ * and was set at map creation time.
+ * 2) The map value(s) have been initialized from user space by a
+ * loader and then "frozen", such that no new map update/delete
+ * operations from syscall side are possible for the rest of
+ * the map's lifetime from that point onwards.
+ * 3) Any parallel/pending map update/delete operations from syscall
+ * side have been completed. Only after that point, it's safe to
+ * assume that map value(s) are immutable.
+ */
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
+ READ_ONCE(map->frozen) &&
+ !bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -3120,13 +6222,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
switch (size) {
case sizeof(u8):
- *val = (u64)*(u8 *)ptr;
+ *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
break;
case sizeof(u16):
- *val = (u64)*(u16 *)ptr;
+ *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
break;
case sizeof(u32):
- *val = (u64)*(u32 *)ptr;
+ *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
break;
case sizeof(u64):
*val = *(u64 *)ptr;
@@ -3137,6 +6239,112 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+ /* no negative dentry-s in places where bpf can see it */
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+ struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
+
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -3144,11 +6352,25 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
int value_regno)
{
struct bpf_reg_state *reg = regs + regno;
- const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
- const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- u32 btf_id;
+ const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+ const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ const char *field_name = NULL;
+ enum bpf_type_flag flag = 0;
+ u32 btf_id = 0;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -3165,35 +6387,258 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, t, off, size,
- atype, &btf_id);
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, t, off, size, atype,
- &btf_id);
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
return ret;
- if (atype == BPF_READ && value_regno >= 0) {
- if (ret == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
- return 0;
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
+
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
+ */
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
+ */
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+
+ /* We always trust them */
+ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
+ flag & PTR_UNTRUSTED)
+ flag &= ~PTR_UNTRUSTED;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
}
- mark_reg_known_zero(env, regs, value_regno);
- regs[value_regno].type = PTR_TO_BTF_ID;
- regs[value_regno].btf_id = btf_id;
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
}
+ if (atype == BPF_READ && value_regno >= 0)
+ mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+
return 0;
}
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ int regno, int off, int size,
+ enum bpf_access_type atype,
+ int value_regno)
+{
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
+ enum bpf_type_flag flag = 0;
+ const struct btf_type *t;
+ const char *tname;
+ u32 btf_id;
+ int ret;
+
+ if (!btf_vmlinux) {
+ verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
+ verbose(env, "map_ptr access not supported for map type %d\n",
+ map->map_type);
+ return -ENOTSUPP;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+
+ if (off < 0) {
+ verbose(env, "R%d is %s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (atype != BPF_READ) {
+ verbose(env, "only read from %s is supported\n", tname);
+ return -EACCES;
+ }
+
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (value_regno >= 0)
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
+
+ return 0;
+}
+
+/* Check that the stack access at the given offset is within bounds. The
+ * maximum valid offset is -1.
+ *
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
+ * -state->allocated_stack for reads.
+ */
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+ s64 off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
+{
+ int min_valid_off;
+
+ if (t == BPF_WRITE || env->allow_uninit_stack)
+ min_valid_off = -MAX_BPF_STACK;
+ else
+ min_valid_off = -state->allocated_stack;
+
+ if (off < min_valid_off || off > -1)
+ return -EACCES;
+ return 0;
+}
+
+/* Check that the stack access at 'regno + off' falls within the maximum stack
+ * bounds.
+ *
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
+ */
+static int check_stack_access_within_bounds(
+ struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ enum bpf_access_src src, enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_func_state *state = func(env, reg);
+ s64 min_off, max_off;
+ int err;
+ char *err_extra;
+
+ if (src == ACCESS_HELPER)
+ /* We don't know if helpers are reading or writing (or both). */
+ err_extra = " indirect access to";
+ else if (type == BPF_READ)
+ err_extra = " read from";
+ else
+ err_extra = " write to";
+
+ if (tnum_is_const(reg->var_off)) {
+ min_off = (s64)reg->var_off.value + off;
+ max_off = min_off + access_size;
+ } else {
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+ reg->smin_value <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+ err_extra, regno);
+ return -EACCES;
+ }
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off + access_size;
+ }
+
+ err = check_stack_slot_within_bounds(env, min_off, state, type);
+ if (!err && max_off > 0)
+ err = -EINVAL; /* out of stack access into non-negative offsets */
+
+ if (err) {
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+ err_extra, regno, off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+ err_extra, regno, tn_buf, off, access_size);
+ }
+ return err;
+ }
+
+ /* Note that there is no stack access with offset zero, so the needed stack
+ * size is -min_off, not -min_off+1.
+ */
+ return grow_stack_state(env, state, -min_off /* size */);
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -3202,11 +6647,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
*/
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
int off, int bpf_size, enum bpf_access_type t,
- int value_regno, bool strict_alignment_once)
+ int value_regno, bool strict_alignment_once, bool is_ldsx)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
- struct bpf_func_state *state;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -3221,7 +6665,21 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
/* for access checks, reg->off is just part of off */
off += reg->off;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
+ struct btf_field *kptr_field = NULL;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -3230,8 +6688,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_map_access_type(env, regno, off, size, t);
if (err)
return err;
- err = check_map_access(env, regno, off, size, false);
- if (!err && t == BPF_READ && value_regno >= 0) {
+ err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ if (err)
+ return err;
+ if (tnum_is_const(reg->var_off))
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+ } else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
/* if map is read-only, track its contents as scalars */
@@ -3242,7 +6707,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
u64 val = 0;
err = bpf_map_direct_read(map, map_off, size,
- &val);
+ &val, is_ldsx);
if (err)
return err;
@@ -3252,18 +6717,34 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
- } else if (reg->type == PTR_TO_MEM) {
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into mem\n", value_regno);
return -EACCES;
}
+
err = check_mem_region_access(env, regno, off, size,
reg->mem_size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
+ struct btf *btf = NULL;
u32 btf_id = 0;
if (t == BPF_WRITE && value_regno >= 0 &&
@@ -3272,11 +6753,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ctx_reg(env, reg, regno);
+ err = check_ptr_off_reg(env, reg, regno);
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+ &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -3289,7 +6771,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (reg_type_may_be_null(reg_type))
+ if (type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -3297,30 +6779,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID ||
- reg_type == PTR_TO_BTF_ID_OR_NULL)
+ if (base_type(reg_type) == PTR_TO_BTF_ID) {
+ regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
+ }
}
regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
- off += reg->var_off.value;
- err = check_stack_access(env, reg, off, size);
- if (err)
- return err;
-
- state = func(env, reg);
- err = update_stack_depth(env, state, off);
+ /* Basic bounds checks. */
+ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
if (err)
return err;
- if (t == BPF_WRITE)
- err = check_stack_write(env, state, off, size,
- value_regno, insn_idx);
- else
- err = check_stack_read(env, state, off, size,
+ if (t == BPF_READ)
+ err = check_stack_read(env, regno, off, size,
value_regno);
+ else
+ err = check_stack_write(env, regno, off, size,
+ value_regno, insn_idx);
} else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose(env, "cannot write into packet\n");
@@ -3349,7 +6827,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
+ regno, reg_type_str(env, reg->type));
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -3359,30 +6837,74 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
+ } else if (reg->type == CONST_PTR_TO_MAP) {
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+ value_regno);
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
+
+ err = check_buffer_access(env, reg, regno, off, size, false,
+ max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EACCES;
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
- /* b/h/w load zero-extends, mark upper bits as known 0 */
- coerce_reg_to_size(&regs[value_regno], size);
+ if (!is_ldsx)
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ coerce_reg_to_size(&regs[value_regno], size);
+ else
+ coerce_reg_to_size_sx(&regs[value_regno], size);
}
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
+ int load_reg;
int err;
- if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
- insn->imm != 0) {
- verbose(env, "BPF_XADD uses reserved fields\n");
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ break;
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
+ return -EINVAL;
+ }
+
+ if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
+ verbose(env, "invalid atomic operand size\n");
return -EINVAL;
}
@@ -3396,6 +6918,20 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (err)
return err;
+ if (insn->imm == BPF_CMPXCHG) {
+ /* Check comparison of R0 with memory location */
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
+ }
+
if (is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
@@ -3405,78 +6941,97 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
is_pkt_reg(env, insn->dst_reg) ||
is_flow_key_reg(env, insn->dst_reg) ||
is_sk_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
- /* check whether atomic_add can read the memory */
+ if (insn->imm & BPF_FETCH) {
+ if (insn->imm == BPF_CMPXCHG)
+ load_reg = BPF_REG_0;
+ else
+ load_reg = insn->src_reg;
+
+ /* check and record load of old value */
+ err = check_reg_arg(env, load_reg, DST_OP);
+ if (err)
+ return err;
+ } else {
+ /* This instruction accesses a memory location but doesn't
+ * actually load it into a register.
+ */
+ load_reg = -1;
+ }
+
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true, false);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, load_reg,
+ true, false);
if (err)
return err;
- /* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1, true);
-}
-
-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
- int off, int access_size,
- bool zero_size_allowed)
-{
- struct bpf_reg_state *reg = reg_state(env, regno);
-
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
- if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
- regno, off, access_size);
- } else {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
- regno, tn_buf, access_size);
- }
- return -EACCES;
- }
+ /* Check whether we can write into the same memory. */
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
+ if (err)
+ return err;
return 0;
}
-/* when register 'regno' is passed into function that will read 'access_size'
- * bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized.
- * Unlike most pointer bounds-checking functions, this one doesn't take an
- * 'off' argument, so it has to add in reg->off itself.
+/* When register 'regno' is used to read the stack (either directly or through
+ * a helper function) make sure that it's within stack boundary and, depending
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
+ *
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
+ *
+ * All registers that have been spilled on the stack in the slots within the
+ * read offsets are marked as read.
*/
-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
- struct bpf_call_arg_meta *meta)
+static int check_stack_range_initialized(
+ struct bpf_verifier_env *env, int regno, int off,
+ int access_size, bool zero_size_allowed,
+ enum bpf_access_src type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
+ char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
+ enum bpf_access_type bounds_check_type;
+ /* Some accesses can write anything into the stack, others are
+ * read-only.
+ */
+ bool clobber = false;
- if (reg->type != PTR_TO_STACK) {
- /* Allow zero-byte read from NULL, regardless of pointer type */
- if (zero_size_allowed && access_size == 0 &&
- register_is_null(reg))
- return 0;
-
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
+ if (access_size == 0 && !zero_size_allowed) {
+ verbose(env, "invalid zero-sized read\n");
return -EACCES;
}
+ if (type == ACCESS_HELPER) {
+ /* The bounds checks for writes are more permissive than for
+ * reads. However, if raw_mode is not set, we'll do extra
+ * checks below.
+ */
+ bounds_check_type = BPF_WRITE;
+ clobber = true;
+ } else {
+ bounds_check_type = BPF_READ;
+ }
+ err = check_stack_access_within_bounds(env, regno, off, access_size,
+ type, bounds_check_type);
+ if (err)
+ return err;
+
+
if (tnum_is_const(reg->var_off)) {
- min_off = max_off = reg->var_off.value + reg->off;
- err = __check_stack_boundary(env, regno, min_off, access_size,
- zero_size_allowed);
- if (err)
- return err;
+ min_off = max_off = reg->var_off.value + off;
} else {
/* Variable offset is prohibited for unprivileged mode for
* simplicity since it requires corresponding support in
@@ -3487,8 +7042,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
- regno, tn_buf);
+ verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, err_extra, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -3500,31 +7055,36 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
if (meta && meta->raw_mode)
meta = NULL;
- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
- reg->smax_value <= -BPF_MAX_VAR_OFF) {
- verbose(env, "R%d unbounded indirect variable offset stack access\n",
- regno);
- return -EACCES;
- }
- min_off = reg->smin_value + reg->off;
- max_off = reg->smax_value + reg->off;
- err = __check_stack_boundary(env, regno, min_off, access_size,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d min value is outside of stack bound\n",
- regno);
- return err;
- }
- err = __check_stack_boundary(env, regno, max_off, access_size,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d max value is outside of stack bound\n",
- regno);
- return err;
- }
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibilitiy of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
@@ -3535,39 +7095,43 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
- if (state->allocated_stack <= slot)
- goto err;
+ if (state->allocated_stack <= slot) {
+ verbose(env, "verifier bug: allocated_stack too small");
+ return -EFAULT;
+ }
+
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
- if (*stype == STACK_ZERO) {
- /* helper can write anything into the stack */
- *stype = STACK_MISC;
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
+ if (clobber) {
+ /* helper can write anything into the stack */
+ *stype = STACK_MISC;
+ }
goto mark;
}
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
- goto mark;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
- __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
- for (j = 0; j < BPF_REG_SIZE; j++)
- state->stack[spi].slot_type[j] = STACK_MISC;
+ if (is_spilled_reg(&state->stack[spi]) &&
+ (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
+ env->allow_ptr_leaks)) {
+ if (clobber) {
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
+ }
goto mark;
}
-err:
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
- min_off, i - min_off, access_size);
+ verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
+ err_extra, regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
- tn_buf, i - min_off, access_size);
+ verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
+ err_extra, regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -3577,8 +7141,13 @@ mark:
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64);
+ /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
- return update_stack_depth(env, state, min_off);
+ return 0;
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -3586,47 +7155,234 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ u32 *max_access;
- switch (reg->type) {
+ switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
if (check_map_access_type(env, regno, reg->off, access_size,
meta && meta->raw_mode ? BPF_WRITE :
BPF_READ))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
- zero_size_allowed);
+ zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ }
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
- default: /* scalar_value|ptr_to_stack or invalid ptr */
- return check_stack_boundary(env, regno, access_size,
- zero_size_allowed, meta);
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
+ return check_buffer_access(env, reg, regno, reg->off,
+ access_size, zero_size_allowed,
+ max_access);
+ case PTR_TO_STACK:
+ return check_stack_range_initialized(
+ env,
+ regno, reg->off, access_size,
+ zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ atype, -1, false, false);
+ }
+
+ fallthrough;
+ default: /* scalar_value or invalid ptr */
+ /* Allow zero-byte read from NULL, regardless of pointer type */
+ if (zero_size_allowed && access_size == 0 &&
+ register_is_null(reg))
+ return 0;
+
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
+ return -EACCES;
+ }
+}
+
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
+ */
+ if (!tnum_is_const(reg->var_off))
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0 && !zero_size_allowed) {
+ verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
+ regno, reg->umin_value, reg->umax_value);
+ return -EACCES;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno, u32 mem_size)
+{
+ bool may_be_null = type_may_be_null(reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ if (register_is_null(reg))
+ return 0;
+
+ memset(&meta, 0, sizeof(meta));
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ if (may_be_null) {
+ saved_reg = *reg;
+ mark_ptr_not_null_reg(reg);
+ }
+
+ err = check_helper_mem_access(env, regno, mem_size, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+
+ if (may_be_null)
+ *reg = saved_reg;
+
+ return err;
+}
+
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ memset(&meta, 0, sizeof(meta));
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
}
+
+ err = check_mem_size_reg(env, reg, regno, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+ return err;
}
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * cur_state->active_lock remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
@@ -3634,94 +7390,583 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
- if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "R%d is not a pointer to map_value\n", regno);
- return -EINVAL;
- }
if (!is_const) {
verbose(env,
"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
regno);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
map->name);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr");
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ if (rec->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
+ val + reg->off, rec->spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
+ if (cur->active_lock.ptr) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- cur->active_spin_lock = reg->id;
+ if (map)
+ cur->active_lock.ptr = map;
+ else
+ cur->active_lock.ptr = btf;
+ cur->active_lock.id = reg->id;
} else {
- if (!cur->active_spin_lock) {
+ void *ptr;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_lock.ptr) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
+ if (cur->active_lock.ptr != ptr ||
+ cur->active_lock.id != reg->id) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_spin_lock = 0;
+
+ invalidate_non_owning_refs(env);
+
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
}
return 0;
}
-static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- return type == ARG_PTR_TO_MEM ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_UNINIT_MEM;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map->record, BPF_TIMER)) {
+ verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
+ return -EINVAL;
+ }
+ if (map->record->timer_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+ val + reg->off, map->record->timer_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
}
-static bool arg_type_is_mem_size(enum bpf_arg_type type)
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- return type == ARG_CONST_SIZE ||
- type == ARG_CONST_SIZE_OR_ZERO;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map_ptr = reg->map_ptr;
+ struct btf_field *kptr_field;
+ u32 kptr_off;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ return -EINVAL;
+ }
+
+ meta->map_ptr = map_ptr;
+ kptr_off = reg->off + reg->var_off.value;
+ kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
+ verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+ return -EACCES;
+ }
+ if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
+ verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
+ return -EACCES;
+ }
+ meta->kptr_field = kptr_field;
+ return 0;
}
-static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type)
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
- return type == ARG_PTR_TO_ALLOC_MEM ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err;
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ return -EFAULT;
+ }
+
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+ return -EINVAL;
+ }
+
+ err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
}
-static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
- return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+ return meta->kfunc_flags & KF_ITER_DESTROY;
}
-static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
{
- return type == ARG_PTR_TO_INT ||
- type == ARG_PTR_TO_LONG;
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect initialized iter state*/
+ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+ switch (err) {
+ case 0:
+ break;
+ case -EINVAL:
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return err;
+ case -EPROTO:
+ verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+ return err;
+ default:
+ return err;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur,
+ int insn_idx)
+{
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *st;
+
+ /* Explored states are pushed in stack order, most recent states come first */
+ sl = *explored_state(env, insn_idx);
+ for (; sl; sl = sl->next) {
+ /* If st->branches != 0 state is a part of current DFS verification path,
+ * hence cur & st for a loop.
+ */
+ st = &sl->state;
+ if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+ st->dfs_depth < cur->dfs_depth)
+ return st;
+ }
+
+ return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ if (rold->type != SCALAR_VALUE)
+ return;
+ if (rold->type != rcur->type)
+ return;
+ if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ return;
+ __mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr;
+
+ reset_idmap_scratch(env);
+ for (fr = old->curframe; fr >= 0; fr--) {
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ maybe_widen_reg(env,
+ &fold->regs[i],
+ &fcur->regs[i],
+ &env->idmap_scratch);
+
+ for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&fold->stack[i]) ||
+ !is_spilled_reg(&fcur->stack[i]))
+ continue;
+
+ maybe_widen_reg(env,
+ &fold->stack[i].spilled_ptr,
+ &fcur->stack[i].spilled_ptr,
+ &env->idmap_scratch);
+ }
+ }
+ return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ * i = 0;
+ * while(iter_next(&it))
+ * i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ * struct bpf_num_iter it;
+ * int arr[10];
+ * int i = 0, a = 0;
+ * bpf_iter_num_new(&it, 0, 10);
+ * while (bpf_iter_num_next(&it)) {
+ * if (a == 0) {
+ * a = 1;
+ * i = 7; // Because i changed verifier would forget
+ * // it's range on second loop entry.
+ * } else {
+ * arr[i] = 42; // This would fail to verify.
+ * }
+ * }
+ * bpf_iter_num_destroy(&it);
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* Because iter_next() call is a checkpoint is_state_visitied()
+ * should guarantee parent state with same call sites and insn_idx.
+ */
+ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+ !same_callsites(cur_st->parent, cur_st)) {
+ verbose(env, "bug: bad parent state for iter next call");
+ return -EFAULT;
+ }
+ /* Note cur_st->parent in the call below, it is necessary to skip
+ * checkpoint created for cur_st by is_state_visited()
+ * right at this instruction.
+ */
+ prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
+
+ return 0;
+}
+
+static bool arg_type_is_mem_size(enum bpf_arg_type type)
+{
+ return type == ARG_CONST_SIZE ||
+ type == ARG_CONST_SIZE_OR_ZERO;
+}
+
+static bool arg_type_is_release(enum bpf_arg_type type)
+{
+ return type & OBJ_RELEASE;
+}
+
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_DYNPTR;
}
static int int_ptr_type_to_size(enum bpf_arg_type type)
@@ -3734,12 +7979,509 @@ static int int_ptr_type_to_size(enum bpf_arg_type type)
return -EINVAL;
}
-static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
+static int resolve_map_arg_type(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_arg_type *arg_type)
+{
+ if (!meta->map_ptr) {
+ /* kernel subsystem misconfigured verifier */
+ verbose(env, "invalid map_ptr to access map->type\n");
+ return -EACCES;
+ }
+
+ switch (meta->map_ptr->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
+ *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
+ } else {
+ verbose(env, "invalid arg_type for sockmap/sockhash\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (meta->func_id == BPF_FUNC_map_peek_elem)
+ *arg_type = ARG_PTR_TO_MAP_VALUE;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct bpf_reg_types {
+ const enum bpf_reg_type types[10];
+ u32 *btf_id;
+};
+
+static const struct bpf_reg_types sock_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ },
+};
+
+#ifdef CONFIG_NET
+static const struct bpf_reg_types btf_id_sock_common_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ },
+ .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+};
+#endif
+
+static const struct bpf_reg_types mem_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
+ PTR_TO_MAP_VALUE,
+ PTR_TO_MEM,
+ PTR_TO_MEM | MEM_RINGBUF,
+ PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ },
+};
+
+static const struct bpf_reg_types int_ptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
+ PTR_TO_MAP_VALUE,
+ },
+};
+
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
+static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
+static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
+static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
+static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
+static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ CONST_PTR_TO_DYNPTR,
+ }
+};
+
+static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
+ [ARG_CONST_SIZE] = &scalar_types,
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
+ [ARG_PTR_TO_CTX] = &context_types,
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
+#ifdef CONFIG_NET
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
+#endif
+ [ARG_PTR_TO_SOCKET] = &fullsock_types,
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
+ [ARG_PTR_TO_MEM] = &mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
+ [ARG_PTR_TO_INT] = &int_ptr_types,
+ [ARG_PTR_TO_LONG] = &int_ptr_types,
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
+ [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
+ [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
+};
+
+static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
+ const u32 *arg_btf_id,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- enum bpf_reg_type expected_type, type = reg->type;
+ enum bpf_reg_type expected, type = reg->type;
+ const struct bpf_reg_types *compatible;
+ int i, j;
+
+ compatible = compatible_reg_types[base_type(arg_type)];
+ if (!compatible) {
+ verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+ if (base_type(arg_type) == ARG_PTR_TO_MEM)
+ type &= ~DYNPTR_TYPE_FLAG_MASK;
+
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
+ type &= ~MEM_ALLOC;
+ type &= ~MEM_PERCPU;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
+ expected = compatible->types[i];
+ if (expected == NOT_INIT)
+ break;
+
+ if (type == expected)
+ goto found;
+ }
+
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
+ for (j = 0; j + 1 < i; j++)
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
+ return -EACCES;
+
+found:
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
+ /* For bpf_sk_release, it needs to match against first member
+ * 'struct sock_common', hence make an exception for it. This
+ * allows bpf_sk_release to work for multiple socket types.
+ */
+ bool strict_type_match = arg_type_is_release(arg_type) &&
+ meta->func_id != BPF_FUNC_sk_release;
+
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
+ if (!arg_btf_id) {
+ if (!compatible->btf_id) {
+ verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
+ return -EFAULT;
+ }
+ arg_btf_id = compatible->btf_id;
+ }
+
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
+ verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ return -EFAULT;
+ }
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ }
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type)
+{
+ u32 type = reg->type;
+
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
+ return -EINVAL;
+ }
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_RINGBUF:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case SCALAR_VALUE:
+ return 0;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
+ */
+ return __check_ptr_off_reg(env, reg, regno, true);
+ default:
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+}
+
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
+
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
+static int check_reg_const_str(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
+
+ if (reg->type != PTR_TO_MAP_VALUE)
+ return -EINVAL;
+
+ if (!bpf_map_is_rdonly(map)) {
+ verbose(env, "R%d does not point to a readonly map'\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a constant address'\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
+ struct bpf_call_arg_meta *meta,
+ const struct bpf_func_proto *fn,
+ int insn_idx)
+{
+ u32 regno = BPF_REG_1 + arg;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ enum bpf_arg_type arg_type = fn->arg_type[arg];
+ enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -3764,107 +8506,65 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_KEY ||
- arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
- expected_type = PTR_TO_STACK;
- if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL)
- /* final test in check_stack_boundary() */;
- else if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_CONST_SIZE ||
- arg_type == ARG_CONST_SIZE_OR_ZERO ||
- arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) {
- expected_type = SCALAR_VALUE;
- if (type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_CONST_MAP_PTR) {
- expected_type = CONST_PTR_TO_MAP;
- if (type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_PTR_TO_CTX ||
- arg_type == ARG_PTR_TO_CTX_OR_NULL) {
- expected_type = PTR_TO_CTX;
- if (!(register_is_null(reg) &&
- arg_type == ARG_PTR_TO_CTX_OR_NULL)) {
- if (type != expected_type)
- goto err_type;
- err = check_ctx_reg(env, reg, regno);
- if (err < 0)
- return err;
- }
- } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
- expected_type = PTR_TO_SOCK_COMMON;
- /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
- if (!type_is_sk_pointer(type))
- goto err_type;
- if (reg->ref_obj_id) {
- if (meta->ref_obj_id) {
- verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id,
- meta->ref_obj_id);
- return -EFAULT;
- }
- meta->ref_obj_id = reg->ref_obj_id;
- }
- } else if (arg_type == ARG_PTR_TO_SOCKET) {
- expected_type = PTR_TO_SOCKET;
- if (type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_PTR_TO_BTF_ID) {
- expected_type = PTR_TO_BTF_ID;
- if (type != expected_type)
- goto err_type;
- if (reg->btf_id != meta->btf_id) {
- verbose(env, "Helper has type %s got %s in R%d\n",
- kernel_type_name(meta->btf_id),
- kernel_type_name(reg->btf_id), regno);
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
+ err = resolve_map_arg_type(env, meta, &arg_type);
+ if (err)
+ return err;
+ }
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) {
- verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+ if (register_is_null(reg) && type_may_be_null(arg_type))
+ /* A NULL register has a SCALAR_VALUE type, so skip
+ * type checking.
+ */
+ goto skip_type_check;
+
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
+ if (err)
+ return err;
+
+ err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (err)
+ return err;
+
+skip_type_check:
+ if (arg_type_is_release(arg_type)) {
+ if (arg_type_is_dynptr(arg_type)) {
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
+ return -EINVAL;
+ }
+ } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ verbose(env, "R%d must be referenced when passed to release function\n",
regno);
- return -EACCES;
+ return -EINVAL;
}
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
- if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
- } else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
- } else {
- verbose(env, "verifier internal error\n");
+ if (meta->release_regno) {
+ verbose(env, "verifier internal error: more than one release argument\n");
return -EFAULT;
}
- } else if (arg_type_is_mem_ptr(arg_type)) {
- expected_type = PTR_TO_STACK;
- /* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a SCALAR_VALUE type. Final test
- * happens during stack boundary checking.
- */
- if (register_is_null(reg) &&
- (arg_type == ARG_PTR_TO_MEM_OR_NULL ||
- arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL))
- /* final test in check_stack_boundary() */;
- else if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MEM &&
- type != expected_type)
- goto err_type;
- meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
- } else if (arg_type_is_alloc_mem_ptr(arg_type)) {
- expected_type = PTR_TO_MEM;
- if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)
- /* final test in check_stack_boundary() */;
- else if (type != expected_type)
- goto err_type;
+ meta->release_regno = regno;
+ }
+
+ if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -3872,21 +8572,36 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EFAULT;
}
meta->ref_obj_id = reg->ref_obj_id;
- } else if (arg_type_is_int_ptr(arg_type)) {
- expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != expected_type)
- goto err_type;
- } else {
- verbose(env, "unsupported arg_type %d\n", arg_type);
- return -EFAULT;
}
- if (arg_type == ARG_CONST_MAP_PTR) {
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
meta->map_ptr = reg->map_ptr;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ meta->map_uid = reg->map_uid;
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -3903,10 +8618,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
- !register_is_null(reg)) ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
+
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
@@ -3915,79 +8631,142 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
+ meta->raw_mode = arg_type & MEM_UNINIT;
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
- } else if (arg_type_is_mem_size(arg_type)) {
- bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
-
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
+ if (!reg->btf_id) {
+ verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
+ meta->ret_btf = reg->btf;
+ meta->ret_btf_id = reg->btf_id;
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ err = process_spin_lock(env, regno, true);
if (err)
return err;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ err = process_spin_lock(env, regno, false);
+ if (err)
+ return err;
+ } else {
+ verbose(env, "verifier internal error\n");
+ return -EFAULT;
}
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
+ break;
+ case ARG_PTR_TO_TIMER:
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_FUNC:
+ meta->subprogno = reg->subprogno;
+ break;
+ case ARG_PTR_TO_MEM:
+ /* The access to this pointer is only checked when we hit the
+ * next is_mem_size argument below.
+ */
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno,
+ fn->arg_size[arg], false,
+ meta);
}
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
- } else if (arg_type_is_alloc_size(arg_type)) {
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno, false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno, true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
+ err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
+ if (err)
+ return err;
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
meta->mem_size = reg->var_off.value;
- } else if (arg_type_is_int_ptr(arg_type)) {
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_INT:
+ case ARG_PTR_TO_LONG:
+ {
int size = int_ptr_type_to_size(arg_type);
err = check_helper_mem_access(env, regno, size, false, meta);
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
+ break;
+ }
+ case ARG_PTR_TO_CONST_STR:
+ {
+ err = check_reg_const_str(env, reg, regno);
+ if (err)
+ return err;
+ break;
+ }
+ case ARG_PTR_TO_KPTR:
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
}
return err;
-err_type:
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[type], reg_type_str[expected_type]);
- return -EACCES;
+}
+
+static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
+{
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+
+ if (func_id != BPF_FUNC_map_update_elem)
+ return false;
+
+ /* It's not possible to get access to a locked struct sock in these
+ * contexts, so updating is safe.
+ */
+ switch (type) {
+ case BPF_PROG_TYPE_TRACING:
+ if (eatype == BPF_TRACE_ITER)
+ return true;
+ break;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return true;
+ default:
+ break;
+ }
+
+ verbose(env, "cannot update sockmap in this context\n");
+ return false;
+}
+
+static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
+{
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -4013,9 +8792,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_submit &&
- func_id != BPF_FUNC_ringbuf_discard &&
- func_id != BPF_FUNC_ringbuf_query)
+ func_id != BPF_FUNC_ringbuf_query &&
+ func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+ func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+ func_id != BPF_FUNC_ringbuf_discard_dynptr)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -4061,7 +8845,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
- func_id != BPF_FUNC_map_lookup_elem)
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -4070,7 +8855,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
- func_id != BPF_FUNC_map_lookup_elem)
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4086,7 +8872,31 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
- func_id != BPF_FUNC_sk_storage_delete)
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ if (func_id != BPF_FUNC_inode_storage_get &&
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ if (func_id != BPF_FUNC_task_storage_get &&
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_map_push_elem)
goto error;
break;
default:
@@ -4098,8 +8908,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt > 1) {
- verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
+ if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
return -EINVAL;
}
break;
@@ -4111,6 +8921,19 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -4150,18 +8973,44 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
- case BPF_FUNC_map_peek_elem:
case BPF_FUNC_map_pop_elem:
- case BPF_FUNC_map_push_elem:
if (map->map_type != BPF_MAP_TYPE_QUEUE &&
map->map_type != BPF_MAP_TYPE_STACK)
goto error;
break;
+ case BPF_FUNC_map_peek_elem:
+ case BPF_FUNC_map_push_elem:
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+ map->map_type != BPF_MAP_TYPE_STACK &&
+ map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
+ goto error;
+ break;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+ goto error;
+ break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
goto error;
break;
+ case BPF_FUNC_inode_storage_get:
+ case BPF_FUNC_inode_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_task_storage_get:
+ case BPF_FUNC_task_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -4195,13 +9044,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
return count <= 1;
}
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
- enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
- return (arg_type_is_mem_ptr(arg_curr) &&
- !arg_type_is_mem_size(arg_next)) ||
- (!arg_type_is_mem_ptr(arg_curr) &&
- arg_type_is_mem_size(arg_next));
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -4212,97 +9067,83 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- arg_type_is_mem_ptr(fn->arg5_type) ||
- check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
- check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
- check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
- check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
return false;
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
+static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
+ int i;
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
+ return false;
+ }
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
+ return true;
}
static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(env, reg);
- }
-}
-
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
+ }));
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
+enum {
+ AT_PKT_END = -1,
+ BEYOND_PKT_END = -2,
+};
+
+static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regn];
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
+ if (reg->type != PTR_TO_PACKET)
+ /* PTR_TO_PACKET_META is not supported yet */
+ return;
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
- }
+ /* The 'reg' is pkt > pkt_end or pkt >= pkt_end.
+ * How far beyond pkt_end it goes is unknown.
+ * if (!range_open) it's the case of pkt >= pkt_end
+ * if (range_open) it's the case of pkt > pkt_end
+ * hence this pointer is at least 1 byte bigger than pkt_end
+ */
+ if (range_open)
+ reg->range = BEYOND_PKT_END;
+ else
+ reg->range = AT_PKT_END;
}
/* The pointer with the specified id has released its reference to kernel
@@ -4311,20 +9152,33 @@ static void release_reg_references(struct bpf_verifier_env *env,
static int release_reference(struct bpf_verifier_env *env,
int ref_obj_id)
{
- struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
err = release_reference_state(cur_func(env), ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
+ }));
return 0;
}
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ mark_reg_invalid(env, reg);
+ }));
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -4333,18 +9187,25 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx);
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
- struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
- int i, err, subprog, target_insn;
- bool is_global = false;
+ int err;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -4352,47 +9213,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -E2BIG;
}
- target_insn = *insn_idx + insn->imm;
- subprog = find_subprog(env, target_insn + 1);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn + 1);
- return -EFAULT;
- }
-
- caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
state->curframe + 1);
return -EFAULT;
}
- func_info_aux = env->prog->aux->func_info_aux;
- if (func_info_aux)
- is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, subprog, caller->regs);
- if (err == -EFAULT)
- return err;
- if (is_global) {
- if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n",
- subprog);
- return err;
- } else {
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env,
- "Func#%d is global and valid. Skipping.\n",
- subprog);
- clear_caller_saved_regs(env, caller->regs);
-
- /* All global functions return SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
-
- /* continue with next insn after call */
- return 0;
- }
- }
-
+ caller = state->frame[state->curframe];
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -4404,43 +9231,497 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
*/
init_func_state(env, callee,
/* remember the callsite, it will be used by bpf_exit */
- *insn_idx /* callsite */,
+ callsite,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
-
/* Transfer references to the callee */
- err = transfer_reference_state(callee, caller);
+ err = copy_reference_state(callee, caller);
+ err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
+
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
+
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ const struct btf *btf,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_verifier_log *log = &env->log;
+ u32 i;
+ int ret;
+
+ ret = btf_prepare_func_args(env, subprog);
+ if (ret)
+ return ret;
+
+ /* check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < sub->arg_cnt; i++) {
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_subprog_arg_info *arg = &sub->args[i];
+
+ if (arg->arg_type == ARG_ANYTHING) {
+ if (reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == ARG_PTR_TO_CTX) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ if (check_mem_reg(env, reg, regno, arg->mem_size))
+ return -EINVAL;
+ if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
+ bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+ if (ret)
+ return ret;
+ } else {
+ bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
+ i, arg->arg_type);
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ err = btf_check_func_arg_match(env, subprog, btf, regs);
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
return err;
- /* copy r1 - r5 args that callee can access. The copy includes parent
- * pointers, which connects us up to the liveness chain
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
*/
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- callee->regs[i] = caller->regs[i];
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_sync_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
- clear_caller_saved_regs(env, caller->regs);
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+ struct bpf_verifier_state *async_cb;
- /* only increment it after check_reg_arg() finished */
- state->curframe++;
+ /* there is no real recursion here. timer callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ insn_idx, subprog);
+ if (!async_cb)
+ return -EFAULT;
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
+ if (err)
+ return err;
+
+ return 0;
+ }
+
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
+ */
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (!callback_state)
+ return -ENOMEM;
+
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
+ if (err)
+ return err;
+
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ const char *sub_name = subprog_name(env, subprog);
+
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+ subprog, sub_name);
+ return err;
+ }
+
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
+ /* mark global subprog for verifying after main prog */
+ subprog_aux(env, subprog)->called = true;
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
+ /* continue with next insn after call */
+ return 0;
+ }
+
+ /* for regular function entry setup new frame and continue
+ * from that frame.
+ */
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
/* and go analyze first insn of the callee */
- *insn_idx = target_insn;
+ *insn_idx = env->subprog_info[subprog].start - 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, state->frame[state->curframe], true);
}
+
return 0;
}
-static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee)
+{
+ /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+ * void *callback_ctx, u64 flags);
+ * callback_fn(struct bpf_map *map, void *key, void *value,
+ * void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx)
+{
+ int i;
+
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+ return 0;
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ if (bpf_map_ptr_poisoned(insn_aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u32 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+ *
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
+ * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+ * by this point, so look at 'root'
+ */
+ struct btf_field *field;
+
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ BPF_RB_ROOT);
+ if (!field || !field->graph_root.value_btf_id)
+ return -EFAULT;
+
+ mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+ mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Are we currently verifying the callback for a rbtree helper that must
+ * be called with lock held? If so, no need to complain about unreleased
+ * lock
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_func_state *callee;
+ int kfunc_btf_id;
+
+ if (!state->curframe)
+ return false;
+
+ callee = state->frame[state->curframe];
+
+ if (!callee->in_callback_fn)
+ return false;
+
+ kfunc_btf_id = insn[callee->callsite].imm;
+ return is_rbtree_lock_required_kfunc(kfunc_btf_id);
+}
+
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+{
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
+static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
struct bpf_func_state *caller, *callee;
struct bpf_reg_state *r0;
+ bool in_callback_fn;
int err;
callee = state->frame[state->curframe];
@@ -4456,47 +9737,124 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
- state->curframe--;
- caller = state->frame[state->curframe];
- /* return to the caller whatever r0 had in the callee */
- caller->regs[BPF_REG_0] = *r0;
+ caller = state->frame[state->curframe - 1];
+ if (callee->in_callback_fn) {
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
+ return -EACCES;
+ }
- /* Transfer references to the caller */
- err = transfer_reference_state(caller, callee);
- if (err)
- return err;
+ /* we are going to rely on register's precise value */
+ err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
+ err = err ?: mark_chain_precision(env, BPF_REG_0);
+ if (err)
+ return err;
+
+ /* enforce R0 return value range */
+ if (!retval_range_within(callee->callback_ret_range, r0)) {
+ verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+ "At callback return", "R0");
+ return -EINVAL;
+ }
+ if (!calls_callback(env, callee->callsite)) {
+ verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
+
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
+
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+ * there function call logic would reschedule callback visit. If iteration
+ * converges is_state_visited() would prune that visit eventually.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
- *insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
}
- /* clear everything in the callee */
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
+ state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; }
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx = { .i = 0; }
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
return 0;
}
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
- int func_id,
- struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
- if (ret_type != RET_INTEGER ||
- (func_id != BPF_FUNC_get_stack &&
- func_id != BPF_FUNC_probe_read_str &&
- func_id != BPF_FUNC_probe_read_kernel_str &&
- func_id != BPF_FUNC_probe_read_user_str))
- return;
+ if (ret_type != RET_INTEGER)
+ return 0;
- ret_reg->smax_value = meta->msize_max_value;
- ret_reg->s32_max_value = meta->msize_max_value;
- __reg_deduce_bounds(ret_reg);
- __reg_bound_offset(ret_reg);
- __update_reg_bounds(ret_reg);
+ switch (func_id) {
+ case BPF_FUNC_get_stack:
+ case BPF_FUNC_get_task_stack:
+ case BPF_FUNC_probe_read_str:
+ case BPF_FUNC_probe_read_kernel_str:
+ case BPF_FUNC_probe_read_user_str:
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_smp_processor_id:
+ ret_reg->umax_value = nr_cpu_ids - 1;
+ ret_reg->u32_max_value = nr_cpu_ids - 1;
+ ret_reg->smax_value = nr_cpu_ids - 1;
+ ret_reg->s32_max_value = nr_cpu_ids - 1;
+ ret_reg->umin_value = 0;
+ ret_reg->u32_min_value = 0;
+ ret_reg->smin_value = 0;
+ ret_reg->s32_min_value = 0;
+ reg_bounds_sync(ret_reg);
+ break;
+ }
+
+ return reg_bounds_sanity_check(env, ret_reg, "retval");
}
static int
@@ -4512,7 +9870,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_map_push_elem &&
func_id != BPF_FUNC_map_pop_elem &&
- func_id != BPF_FUNC_map_peek_elem)
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem &&
+ func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_percpu_elem)
return 0;
if (map == NULL) {
@@ -4549,8 +9910,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
- struct tnum range;
- u64 val;
+ u64 val, max;
int err;
if (func_id != BPF_FUNC_tail_call)
@@ -4560,10 +9920,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
- if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ if (!(is_reg_const(reg, false) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -4571,8 +9932,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
err = mark_chain_precision(env, BPF_REG_3);
if (err)
return err;
-
- val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
@@ -4581,27 +9940,135 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (!exception_exit && state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
+ struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+ struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+ struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
+ int err, fmt_map_off, num_args;
+ u64 fmt_addr;
+ char *fmt;
+
+ /* data must be an array of u64 */
+ if (data_len_reg->var_off.value % 8)
+ return -EINVAL;
+ num_args = data_len_reg->var_off.value / 8;
+
+ /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+ * and map_direct_value_addr is set.
+ */
+ fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+ fmt_map_off);
+ if (err) {
+ verbose(env, "verifier bug\n");
+ return -EFAULT;
+ }
+ fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+ /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
+ * can focus on validating the format specifiers.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
+ if (err < 0)
+ verbose(env, "Invalid format string\n");
+
+ return err;
+}
+
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ int func_id = BPF_FUNC_get_func_ip;
+
+ if (type == BPF_PROG_TYPE_TRACING) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
+ verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ func_id_name(func_id), func_id);
+ return -ENOTSUPP;
+ }
+ return 0;
+ } else if (type == BPF_PROG_TYPE_KPROBE) {
+ return 0;
+ }
+
+ verbose(env, "func %s#%d not supported for program type %d\n",
+ func_id_name(func_id), func_id, type);
+ return -ENOTSUPP;
+}
+
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool returns_cpu_specific_alloc_ptr = false;
const struct bpf_func_proto *fn = NULL;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
bool changes_data;
- int i, err;
+ int i, err, func_id;
/* find function prototype */
+ func_id = insn->imm;
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
func_id);
@@ -4622,6 +10089,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ if (fn->allowed && !fn->allowed(env->prog)) {
+ verbose(env, "helper call is not allowed in probe\n");
+ return -EINVAL;
+ }
+
+ if (!env->prog->aux->sleepable && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -4640,13 +10117,21 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return err;
}
+ if (env->cur_state->active_rcu_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
- for (i = 0; i < 5; i++) {
- err = btf_resolve_helper_id(&env->log, fn, i);
- if (err > 0)
- meta.btf_id = err;
- err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -4664,19 +10149,53 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
*/
for (i = 0; i < meta.access_size; i++) {
err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
}
- if (func_id == BPF_FUNC_tail_call) {
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
- return err;
+ regs = cur_regs(env);
+
+ if (meta.release_regno) {
+ err = -EINVAL;
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ return -EFAULT;
+ }
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+ u32 ref_obj_id = meta.ref_obj_id;
+ bool in_rcu = in_rcu_cs(env);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ err = release_reference_state(cur_func(env), ref_obj_id);
+ if (!err) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->ref_obj_id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ } else {
+ mark_reg_invalid(env, reg);
+ }
+ }
+ }));
+ }
+ } else if (meta.ref_obj_id) {
+ err = release_reference(env, meta.ref_obj_id);
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
+ err = 0;
}
- } else if (is_release_function(func_id)) {
- err = release_reference(env, meta.ref_obj_id);
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -4684,17 +10203,158 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
}
- regs = cur_regs(env);
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_reference_leak(env, false);
+ if (err) {
+ verbose(env, "tail_call would lead to reference leak\n");
+ return err;
+ }
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
+ break;
+ case BPF_FUNC_timer_set_callback:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ break;
+ case BPF_FUNC_find_vma:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
+ err = check_bpf_snprintf_call(env, regs);
+ break;
+ case BPF_FUNC_loop:
+ update_loop_inline_state(env, meta.subprogno);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
+ break;
+ case BPF_FUNC_dynptr_from_mem:
+ if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+ verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+ reg_type_str(env, regs[BPF_REG_1].type));
+ return -EACCES;
+ }
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
+ case BPF_FUNC_dynptr_data:
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
- /* check that flags argument in get_local_storage(map, flags) is 0,
- * this is required because get_local_storage() can't return an error.
- */
- if (func_id == BPF_FUNC_get_local_storage &&
- !register_is_null(&regs[BPF_REG_2])) {
- verbose(env, "get_local_storage() doesn't support non-zero flags\n");
- return -EINVAL;
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
+
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
+ break;
+ }
+ case BPF_FUNC_per_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
+ {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1];
+ const struct btf_type *type;
+
+ if (reg->type & MEM_RCU) {
+ type = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!type || !btf_type_is_struct(type)) {
+ verbose(env, "Helper has invalid btf/btf_id in R1\n");
+ return -EFAULT;
+ }
+ returns_cpu_specific_alloc_ptr = true;
+ env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+ }
+ break;
+ }
+ case BPF_FUNC_user_ringbuf_drain:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
+ break;
}
+ if (err)
+ return err;
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -4705,13 +10365,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */
- if (fn->ret_type == RET_INTEGER) {
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (fn->ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -4724,38 +10389,123 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- if (map_value_has_spin_lock(meta.map_ptr))
- regs[BPF_REG_0].id = ++env->id_gen;
- } else {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_type) &&
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- regs[BPF_REG_0].id = ++env->id_gen;
- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
- regs[BPF_REG_0].id = ++env->id_gen;
- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
- regs[BPF_REG_0].id = ++env->id_gen;
- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ break;
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
- regs[BPF_REG_0].id = ++env->id_gen;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else {
- verbose(env, "unknown return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
+ const struct btf_type *t;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
+ if (!btf_type_is_struct(t)) {
+ u32 tsize;
+ const struct btf_type *ret;
+ const char *tname;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(meta.ret_btf, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(meta.ret_btf, t->name_off);
+ verbose(env, "unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+ regs[BPF_REG_0].mem_size = tsize;
+ } else {
+ if (returns_cpu_specific_alloc_ptr) {
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+ } else {
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ }
+
+ regs[BPF_REG_0].btf = meta.ret_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ }
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
+ struct btf *ret_btf;
+ int ret_btf_id;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ if (func_id == BPF_FUNC_kptr_xchg) {
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf)) {
+ regs[BPF_REG_0].type |= MEM_ALLOC;
+ if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+ }
+ } else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+ func_id_name(func_id));
+ return -EINVAL;
+ }
+ ret_btf = btf_vmlinux;
+ ret_btf_id = *fn->ret_btf_id;
+ }
+ if (ret_btf_id == 0) {
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ break;
+ }
+ default:
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
}
- if (is_ptr_cast_function(func_id)) {
+ if (type_may_be_null(regs[BPF_REG_0].type))
+ regs[BPF_REG_0].id = ++env->id_gen;
+
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
@@ -4769,13 +10519,17 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].ref_obj_id = id;
}
- do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
- if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+ if ((func_id == BPF_FUNC_get_stack ||
+ func_id == BPF_FUNC_get_task_stack) &&
+ !env->prog->has_callchain_buf) {
const char *err_str;
#ifdef CONFIG_PERF_EVENTS
@@ -4793,11 +10547,1840 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
env->prog->has_callchain_buf = true;
}
+ if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+ env->prog->call_get_stack = true;
+
+ if (func_id == BPF_FUNC_get_func_ip) {
+ if (check_get_func_ip(env))
+ return -ENOTSUPP;
+ env->prog->call_get_func_ip = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
}
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+ if (regno == BPF_REG_0) {
+ /* Function return value */
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = reg_size == sizeof(u64) ?
+ DEF_NOT_SUBREG : env->insn_idx + 1;
+ } else {
+ /* Function argument */
+ if (reg_size == sizeof(u64)) {
+ mark_insn_zext(env, reg);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ } else {
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+ }
+ }
+}
+
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
+static bool __kfunc_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__opt");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__nullable");
+}
+
+static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__str");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+ const struct btf_param *arg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
+ if (!t)
+ return false;
+
+ return true;
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
+ KF_ARG_PTR_TO_NULL,
+ KF_ARG_PTR_TO_CONST_STR,
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_first,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
+ KF_bpf_dynptr_clone,
+ KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
+ KF_bpf_iter_css_task_new,
+};
+
+BTF_SET_START(special_kfunc_set)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#endif
+BTF_SET_END(special_kfunc_set)
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#else
+BTF_ID_UNUSED
+#endif
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ meta->arg_owning_ref) {
+ return false;
+ }
+
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
+ if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CONST_STR;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if (is_kfunc_acquire(meta) ||
+ (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
+ strict_type_match = true;
+
+ WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct btf_record *rec = reg_btf_record(reg);
+
+ if (!state->active_lock.ptr) {
+ verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+ return -EFAULT;
+ }
+
+ if (type_flag(reg->type) & NON_OWN_REF) {
+ verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+ return -EFAULT;
+ }
+
+ reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
+ return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state, *unused;
+ struct bpf_reg_state *reg;
+ int i;
+
+ state = cur_func(env);
+
+ if (!ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id is zero for "
+ "owning -> non-owning conversion\n");
+ return -EFAULT;
+ }
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
+ return 0;
+ }
+
+ verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ ptr = reg->btf;
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_lock.ptr)
+ return -EINVAL;
+ if (env->cur_state->active_lock.ptr != ptr ||
+ env->cur_state->active_lock.id != id) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+}
+
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+}
+
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+ enum btf_field_type head_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (head_field_type) {
+ case BPF_LIST_HEAD:
+ ret = is_bpf_list_api_kfunc(kfunc_btf_id);
+ break;
+ case BPF_RB_ROOT:
+ ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+ btf_field_type_name(head_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+ btf_field_type_name(head_field_type));
+ return ret;
+}
+
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+ enum btf_field_type node_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (node_field_type) {
+ case BPF_LIST_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ break;
+ case BPF_RB_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+ btf_field_type_name(node_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+ btf_field_type_name(node_field_type));
+ return ret;
+}
+
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 head_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
+ return -EINVAL;
+ }
+
+ if (*head_field) {
+ verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
+ return -EFAULT;
+ }
+ *head_field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ u32 node_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
+ return -EINVAL;
+ }
+
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field || field->offset != node_off) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
+ return -EINVAL;
+ }
+
+ field = *node_field;
+
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+}
+
+/*
+ * css_task iter allowlist is needed to avoid dead locking on css_set_lock.
+ * LSM hooks and iters (both sleepable and non-sleepable) are safe.
+ * Any sleepable progs are also safe since bpf_check_attach_target() enforce
+ * them can only be attached to some specific hook points.
+ */
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ return true;
+ case BPF_PROG_TYPE_TRACING:
+ if (env->prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+ fallthrough;
+ default:
+ return env->prog->aux->sleepable;
+ }
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
+ const struct btf_param *args;
+ struct btf_record *rec;
+ u32 i, nargs;
+ int ret;
+
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
+ (register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_NULL:
+ continue;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ /* Trusted arguments have the same offset checks as release arguments */
+ arg_type |= OBJ_RELEASE;
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ case KF_ARG_PTR_TO_CONST_STR:
+ /* Trusted by default */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux) {
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+ int clone_ref_obj_id = 0;
+
+ if (reg->type != PTR_TO_STACK &&
+ reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
+ return -EINVAL;
+ }
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+
+ if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+ verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
+ return -EFAULT;
+ }
+
+ dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+ clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+ if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+ verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
+ return -EFAULT;
+ }
+ }
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+ if (!check_css_task_iter_allowlist(env)) {
+ verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
+ return -EINVAL;
+ }
+ }
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+ if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
+ verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ return -EINVAL;
+ }
+ } else {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ {
+ struct bpf_reg_state *buff_reg = &regs[regno];
+ const struct btf_param *buff_arg = &args[i];
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ }
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
+ i++;
+ break;
+ }
+ case KF_ARG_PTR_TO_CALLBACK:
+ if (reg->type != PTR_TO_FUNC) {
+ verbose(env, "arg%d expected pointer to func\n", i);
+ return -EINVAL;
+ }
+ meta->subprogno = reg->subprogno;
+ break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+ if (!type_is_non_owning_ref(reg->type))
+ meta->arg_owning_ref = true;
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verbose(env, "verifier internal error: Couldn't find btf_record\n");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ break;
+ case KF_ARG_PTR_TO_CONST_STR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a const string\n", i);
+ return -EINVAL;
+ }
+ ret = check_reg_const_str(env, reg, regno);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
+{
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
+ struct btf *desc_btf;
+
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
+ if (!insn->imm)
+ return -EINVAL;
+
+ desc_btf = find_kfunc_desc_btf(env, insn->off);
+ if (IS_ERR(desc_btf))
+ return PTR_ERR(desc_btf);
+
+ func_id = insn->imm;
+ func = btf_type_by_id(desc_btf, func_id);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
+ func_proto = btf_type_by_id(desc_btf, func->type);
+
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
+ if (!kfunc_flags) {
+ return -EACCES;
+ }
+
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ const struct btf_type *t, *ptr_type;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ const struct btf_type *ret_t;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ if (err)
+ return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !env->prog->aux->sleepable) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+
+ if (env->cur_state->active_rcu_lock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
+ if (rcu_lock) {
+ verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ env->cur_state->active_rcu_lock = false;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+ } else if (rcu_lock) {
+ env->cur_state->active_rcu_lock = true;
+ } else if (rcu_unlock) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
+ */
+ if (meta.release_regno) {
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, meta.func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
+ func_name, meta.func_id);
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+
+ /* In the case of the default callback, the cookie value passed
+ * to bpf_throw becomes the return value of the program.
+ */
+ if (!env->exception_callback_subprog) {
+ err = check_return_code(env, BPF_REG_1, "R1");
+ if (err < 0)
+ return err;
+ }
+ }
+
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ /* Check return type */
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
+
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_type_is_scalar(t)) {
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+ } else if (btf_type_is_ptr(t)) {
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta.arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta.arg_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ struct btf_field *field = meta.arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ struct btf_field *field = meta.arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta.arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta.initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ verbose(env, "kernel function %s unhandled dynamic return type\n",
+ meta.func_name);
+ return -EFAULT;
+ }
+ } else if (!__btf_type_is_struct(ptr_type)) {
+ if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+ }
+
+ if (is_kfunc_ret_null(&meta)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ if (is_kfunc_acquire(&meta)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
+ }
+
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
+ }
+ }
+ }
+
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
+ for (i = 0; i < nargs; i++) {
+ u32 regno = i + 1;
+
+ t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
+ if (btf_type_is_ptr(t))
+ mark_btf_func_reg_size(env, regno, sizeof(void *));
+ else
+ /* scalar. ensured by btf_check_kfunc_arg_match() */
+ mark_btf_func_reg_size(env, regno, t->size);
+ }
+
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -4808,7 +12391,7 @@ static bool signed_add_overflows(s64 a, s64 b)
return res < a;
}
-static bool signed_add32_overflows(s64 a, s64 b)
+static bool signed_add32_overflows(s32 a, s32 b)
{
/* Do the add in u32, where overflow is well-defined */
s32 res = (s32)((u32)a + (u32)b);
@@ -4818,7 +12401,7 @@ static bool signed_add32_overflows(s64 a, s64 b)
return res < a;
}
-static bool signed_sub_overflows(s32 a, s32 b)
+static bool signed_sub_overflows(s64 a, s64 b)
{
/* Do the sub in u64, where overflow is well-defined */
s64 res = (s64)((u64)a - (u64)b);
@@ -4830,7 +12413,7 @@ static bool signed_sub_overflows(s32 a, s32 b)
static bool signed_sub32_overflows(s32 a, s32 b)
{
- /* Do the sub in u64, where overflow is well-defined */
+ /* Do the sub in u32, where overflow is well-defined */
s32 res = (s32)((u32)a - (u32)b);
if (b < 0)
@@ -4848,65 +12431,68 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
- reg_type_str[type], val);
+ reg_type_str(env, type), val);
return false;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str[type], reg->off);
+ reg_type_str(env, type), reg->off);
return false;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
- reg_type_str[type]);
+ reg_type_str(env, type));
return false;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
- smin, reg_type_str[type]);
+ smin, reg_type_str(env, type));
return false;
}
return true;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
- return &env->insn_aux_data[env->insn_idx];
-}
+enum {
+ REASON_BOUNDS = -1,
+ REASON_TYPE = -2,
+ REASON_PATHS = -3,
+ REASON_LIMIT = -4,
+ REASON_STACK = -5,
+};
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
- u32 *ptr_limit, u8 opcode, bool off_is_neg)
+ u32 *alu_limit, bool mask_to_left)
{
- bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
- (opcode == BPF_SUB && !off_is_neg);
- u32 off;
+ u32 max = 0, ptr_limit = 0;
switch (ptr_reg->type) {
case PTR_TO_STACK:
- /* Indirect variable offset stack access is prohibited in
- * unprivileged mode so it's not handled here.
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
*/
- off = ptr_reg->off + ptr_reg->var_off.value;
- if (mask_to_left)
- *ptr_limit = MAX_BPF_STACK + off;
- else
- *ptr_limit = -off;
- return 0;
+ max = MAX_BPF_STACK + mask_to_left;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
case PTR_TO_MAP_VALUE:
- if (mask_to_left) {
- *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off;
- }
- return 0;
+ max = ptr_reg->map_ptr->value_size;
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
+ break;
default:
- return -EINVAL;
+ return REASON_TYPE;
}
+
+ if (ptr_limit >= max)
+ return REASON_LIMIT;
+ *alu_limit = ptr_limit;
+ return 0;
}
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
@@ -4924,9 +12510,9 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
if (aux->alu_state &&
(aux->alu_state != alu_state ||
aux->alu_limit != alu_limit))
- return -EACCES;
+ return REASON_PATHS;
- /* Corresponding fixup done in fixup_bpf_calls(). */
+ /* Corresponding fixup done in do_misc_fixups(). */
aux->alu_state = alu_state;
aux->alu_limit = alu_limit;
return 0;
@@ -4943,19 +12529,55 @@ static int sanitize_val_alu(struct bpf_verifier_env *env,
return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
}
+static bool sanitize_needed(u8 opcode)
+{
+ return opcode == BPF_ADD || opcode == BPF_SUB;
+}
+
+struct bpf_sanitize_info {
+ struct bpf_insn_aux_data aux;
+ bool mask_to_left;
+};
+
+static struct bpf_verifier_state *
+sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
+{
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, next_idx, curr_idx, true);
+ if (branch && insn) {
+ regs = branch->frame[branch->curframe]->regs;
+ if (BPF_SRC(insn->code) == BPF_K) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->src_reg);
+ }
+ }
+ return branch;
+}
+
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
struct bpf_reg_state *dst_reg,
- bool off_is_neg)
+ struct bpf_sanitize_info *info,
+ const bool commit_window)
{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_imm = tnum_is_const(off_reg->var_off);
+ bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
bool ret;
+ int err;
if (can_skip_alu_sanitation(env, insn))
return 0;
@@ -4967,15 +12589,53 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;
- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
- alu_state |= ptr_is_dst_reg ?
- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ if (!commit_window) {
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
- if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
- return 0;
- if (update_alu_sanitation_state(aux, alu_state, alu_limit))
- return -EACCES;
+ info->mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+ }
+
+ err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
+ if (err < 0)
+ return err;
+
+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = info->aux.alu_state;
+ alu_limit = abs(info->aux.alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
+ }
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ *
+ * Also, when register is a known constant, we rewrite register-based
+ * operation to immediate-based, and thus do not need masking (and as
+ * a consequence, do not need to simulate the zero-truncation either).
+ */
+ if (commit_window || off_is_imm)
+ return 0;
+
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
* masking when off was not within expected range. If off
@@ -4987,12 +12647,131 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
- ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+ ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
+ env->insn_idx);
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
- return !ret ? -EFAULT : 0;
+ return !ret ? REASON_STACK : 0;
+}
+
+static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+
+ /* If we simulate paths under speculation, we don't update the
+ * insn as 'seen' such that when we verify unreachable paths in
+ * the non-speculative domain, sanitize_dead_code() can still
+ * rewrite/sanitize them.
+ */
+ if (!vstate->speculative)
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+}
+
+static int sanitize_err(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int reason,
+ const struct bpf_reg_state *off_reg,
+ const struct bpf_reg_state *dst_reg)
+{
+ static const char *err = "pointer arithmetic with it prohibited for !root";
+ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
+ u32 dst = insn->dst_reg, src = insn->src_reg;
+
+ switch (reason) {
+ case REASON_BOUNDS:
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
+ off_reg == dst_reg ? dst : src, err);
+ break;
+ case REASON_TYPE:
+ verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
+ off_reg == dst_reg ? src : dst, err);
+ break;
+ case REASON_PATHS:
+ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
+ dst, op, err);
+ break;
+ case REASON_LIMIT:
+ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
+ dst, op, err);
+ break;
+ case REASON_STACK:
+ verbose(env, "R%d could not be pushed for speculative verification, %s\n",
+ dst, err);
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reason (%d)\n",
+ reason);
+ break;
+ }
+
+ return -EACCES;
+}
+
+/* check that stack access falls within stack limits and that 'reg' doesn't
+ * have a variable offset.
+ *
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
+ * requires corresponding support in Spectre masking for stack ALU. See also
+ * retrieve_ptr_limit().
+ *
+ *
+ * 'off' includes 'reg->off'.
+ */
+static int check_stack_access_for_ptr_arithmetic(
+ struct bpf_verifier_env *env,
+ int regno,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+ regno, tn_buf, off);
+ return -EACCES;
+ }
+
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
+ "prohibited for !root; off=%d\n", regno, off);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int sanitize_check_bounds(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ const struct bpf_reg_state *dst_reg)
+{
+ u32 dst = insn->dst_reg;
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (env->bypass_spec_v1)
+ return 0;
+
+ switch (dst_reg->type) {
+ case PTR_TO_STACK:
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
+ dst_reg->off + dst_reg->var_off.value))
+ return -EACCES;
+ break;
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
}
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
@@ -5013,8 +12792,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
- u32 dst = insn->dst_reg, src = insn->src_reg;
+ struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
int ret;
dst_reg = &regs[dst];
@@ -5030,36 +12810,41 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+
verbose(env,
"R%d 32-bit pointer arithmetic prohibited\n",
dst);
return -EACCES;
}
- switch (ptr_reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL:
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
+ }
+
+ switch (base_type(ptr_reg->type)) {
+ case PTR_TO_FLOW_KEYS:
+ if (known)
+ break;
+ fallthrough;
case CONST_PTR_TO_MAP:
+ /* smin_val represents the known value */
+ if (known && smin_val == 0 && opcode == BPF_ADD)
+ break;
+ fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
- case PTR_TO_MAP_VALUE:
- if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
- verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
- off_reg == dst_reg ? dst : src);
- return -EACCES;
- }
- /* fall-through */
default:
break;
}
@@ -5077,13 +12862,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &info, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
switch (opcode) {
case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different maps or paths\n", dst);
- return ret;
- }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -5130,15 +12917,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
- dst_reg->raw = 0;
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps or paths\n", dst);
- return ret;
- }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -5195,7 +12977,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
- dst_reg->raw = 0;
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_AND:
@@ -5214,27 +12996,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
-
- /* For unprivileged we require that resulting offset must be in bounds
- * in order to be able to sanitize access later on.
- */
- if (!env->bypass_spec_v1) {
- if (dst_reg->type == PTR_TO_MAP_VALUE &&
- check_map_access(env, dst, dst_reg->off, 1, false)) {
- verbose(env, "R%d pointer arithmetic of map value goes out of range, "
- "prohibited for !root\n", dst);
- return -EACCES;
- } else if (dst_reg->type == PTR_TO_STACK &&
- check_stack_access(env, dst_reg, dst_reg->off +
- dst_reg->var_off.value, 1)) {
- verbose(env, "R%d stack pointer arithmetic goes out of range, "
- "prohibited for !root\n", dst);
- return -EACCES;
- }
+ reg_bounds_sync(dst_reg);
+ if (sanitize_check_bounds(env, insn, dst_reg) < 0)
+ return -EACCES;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &info, true);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
return 0;
@@ -5421,11 +13190,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
- /* Assuming scalar64_min_max_and will be called so its safe
- * to skip updating register for known 32-bit case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
@@ -5445,7 +13213,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
-
}
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
@@ -5457,8 +13224,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
- __mark_reg_known(dst_reg, dst_reg->var_off.value &
- src_reg->var_off.value);
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
@@ -5490,14 +13256,13 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->smin_value;
- u32 umin_val = src_reg->umin_value;
+ s32 smin_val = src_reg->s32_min_value;
+ u32 umin_val = src_reg->u32_min_value;
- /* Assuming scalar64_min_max_or will be called so it is safe
- * to skip updating register for known case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima
@@ -5514,8 +13279,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
/* ORing two positives gives a positive, so safe to
* cast result into s64.
*/
- dst_reg->s32_min_value = dst_reg->umin_value;
- dst_reg->s32_max_value = dst_reg->umax_value;
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
@@ -5528,8 +13293,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
- __mark_reg_known(dst_reg, dst_reg->var_off.value |
- src_reg->var_off.value);
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
@@ -5555,6 +13319,66 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
+ s32 smin_val = src_reg->s32_min_value;
+
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
+ return;
+ }
+
+ /* We get both minimum and maximum from the var32_off. */
+ dst_reg->u32_min_value = var32_off.value;
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
+
+ if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
+ /* XORing two positive sign numbers gives a positive,
+ * so safe to cast u32 result into s32.
+ */
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ }
+}
+
+static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_is_const(src_reg->var_off);
+ bool dst_known = tnum_is_const(dst_reg->var_off);
+ s64 smin_val = src_reg->smin_value;
+
+ if (src_known && dst_known) {
+ /* dst_reg->var_off.value has been updated earlier */
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
+ return;
+ }
+
+ /* We get both minimum and maximum from the var_off. */
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+
+ if (dst_reg->smin_value >= 0 && smin_val >= 0) {
+ /* XORing two positive sign numbers gives a positive,
+ * so safe to cast u64 result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+
+ __update_reg_bounds(dst_reg);
+}
+
static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
@@ -5652,7 +13476,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
* 3) the signed bounds cross zero, so they tell us nothing
* about the result
* If the value in dst_reg is known nonnegative, then again the
- * unsigned bounts capture the signed bounds.
+ * unsigned bounds capture the signed bounds.
* Thus, in all cases it suffices to blow away our signed bounds
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
@@ -5683,7 +13507,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
* 3) the signed bounds cross zero, so they tell us nothing
* about the result
* If the value in dst_reg is known nonnegative, then again the
- * unsigned bounts capture the signed bounds.
+ * unsigned bounds capture the signed bounds.
* Thus, in all cases it suffices to blow away our signed bounds
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
@@ -5769,9 +13593,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
s32 s32_min_val, s32_max_val;
u32 u32_min_val, u32_max_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- u32 dst = insn->dst_reg;
- int ret;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ int ret;
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
@@ -5813,6 +13636,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
return 0;
}
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, NULL, NULL);
+ }
+
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
* There are two classes of instructions: The first class we track both
* alu32 and alu64 sign/unsigned bounds independently this provides the
@@ -5829,21 +13658,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
switch (opcode) {
case BPF_ADD:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_sub(dst_reg, &src_reg);
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
@@ -5863,6 +13682,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
break;
+ case BPF_XOR:
+ dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_xor(dst_reg, &src_reg);
+ scalar_min_max_xor(dst_reg, &src_reg);
+ break;
case BPF_LSH:
if (umax_val >= insn_bitness) {
/* Shifts greater than 31 or 63 are undefined.
@@ -5910,10 +13734,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
return 0;
}
@@ -5934,6 +13755,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
src_reg = NULL;
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
+ else
+ /* Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by find_equal_scalars()
+ */
+ dst_reg->id = 0;
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -5968,6 +13794,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -5983,12 +13814,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -6004,7 +13835,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
verbose(env, "BPF_NEG uses reserved fields\n");
@@ -6013,7 +13844,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- BPF_CLASS(insn->code) == BPF_ALU64) {
+ (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_SRC(insn->code) != BPF_TO_LE)) {
verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -6038,11 +13870,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
+ if (BPF_CLASS(insn->code) == BPF_ALU) {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -6062,14 +13907,46 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = regs + insn->src_reg;
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+ bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
+ !tnum_is_const(src_reg->var_off);
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- /* case: R1 = R2
- * copy register state to dest reg
- */
- *dst_reg = *src_reg;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = DEF_NOT_SUBREG;
+ if (insn->off == 0) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ if (need_id)
+ /* Assign src and dst registers the same ID
+ * that will be used by find_equal_scalars()
+ * to propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ /* case: R1 = (s8, s16 s32)R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d sign-extension part of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ bool no_sext;
+
+ no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ }
+ }
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -6078,14 +13955,39 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = env->insn_idx + 1;
+ if (insn->off == 0) {
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32
+ * range otherwise dst_reg min/max could be incorrectly
+ * propagated into src_reg by find_equal_scalars()
+ */
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ } else {
+ /* case: W1 = (s8, s16)W2 */
+ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
+ }
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
}
zext_32_to_64(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
@@ -6110,7 +14012,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -6119,7 +14022,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ if (insn->src_reg != BPF_REG_0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -6148,35 +14052,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* check dest operand */
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_reg_min_max_vals(env, insn);
if (err)
return err;
-
- return adjust_reg_min_max_vals(env, insn);
- }
-
- return 0;
-}
-
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, u16 new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
}
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
+ return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
}
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
@@ -6184,8 +14065,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
enum bpf_reg_type type,
bool range_right_open)
{
- u16 new_range;
- int i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -6201,7 +14083,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off;
if (range_right_open)
- new_range--;
+ new_range++;
/* Examples for register markings:
*
@@ -6250,77 +14132,137 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
-static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
-{
- struct tnum subreg = tnum_subreg(reg->var_off);
- s32 sval = (s32)val;
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+ struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+ u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
+ u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
+ s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
+ s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
+ u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
+ u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
+ s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
+ s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
switch (opcode) {
case BPF_JEQ:
- if (tnum_is_const(subreg))
- return !!tnum_equals_const(subreg, val);
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value == t2.value;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
+ return 0;
+ if (smin1 > smax2 || smax1 < smin2)
+ return 0;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 0;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 0;
+ }
break;
case BPF_JNE:
- if (tnum_is_const(subreg))
- return !tnum_equals_const(subreg, val);
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value != t2.value;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
+ return 1;
+ if (smin1 > smax2 || smax1 < smin2)
+ return 1;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 1;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 1;
+ }
break;
case BPF_JSET:
- if ((~subreg.mask & subreg.value) & val)
+ if (!is_reg_const(reg2, is_jmp32)) {
+ swap(reg1, reg2);
+ swap(t1, t2);
+ }
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+ if ((~t1.mask & t1.value) & t2.value)
return 1;
- if (!((subreg.mask | subreg.value) & val))
+ if (!((t1.mask | t1.value) & t2.value))
return 0;
break;
case BPF_JGT:
- if (reg->u32_min_value > val)
+ if (umin1 > umax2)
return 1;
- else if (reg->u32_max_value <= val)
+ else if (umax1 <= umin2)
return 0;
break;
case BPF_JSGT:
- if (reg->s32_min_value > sval)
+ if (smin1 > smax2)
return 1;
- else if (reg->s32_max_value < sval)
+ else if (smax1 <= smin2)
return 0;
break;
case BPF_JLT:
- if (reg->u32_max_value < val)
+ if (umax1 < umin2)
return 1;
- else if (reg->u32_min_value >= val)
+ else if (umin1 >= umax2)
return 0;
break;
case BPF_JSLT:
- if (reg->s32_max_value < sval)
+ if (smax1 < smin2)
return 1;
- else if (reg->s32_min_value >= sval)
+ else if (smin1 >= smax2)
return 0;
break;
case BPF_JGE:
- if (reg->u32_min_value >= val)
+ if (umin1 >= umax2)
return 1;
- else if (reg->u32_max_value < val)
+ else if (umax1 < umin2)
return 0;
break;
case BPF_JSGE:
- if (reg->s32_min_value >= sval)
+ if (smin1 >= smax2)
return 1;
- else if (reg->s32_max_value < sval)
+ else if (smax1 < smin2)
return 0;
break;
case BPF_JLE:
- if (reg->u32_max_value <= val)
+ if (umax1 <= umin2)
return 1;
- else if (reg->u32_min_value > val)
+ else if (umin1 > umax2)
return 0;
break;
case BPF_JSLE:
- if (reg->s32_max_value <= sval)
+ if (smax1 <= smin2)
return 1;
- else if (reg->s32_min_value > sval)
+ else if (smin1 > smax2)
return 0;
break;
}
@@ -6328,96 +14270,99 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
return -1;
}
+static int flip_opcode(u32 opcode)
+{
+ /* How can we transform "a <op> b" into "b <op> a"? */
+ static const u8 opcode_flip[16] = {
+ /* these stay the same */
+ [BPF_JEQ >> 4] = BPF_JEQ,
+ [BPF_JNE >> 4] = BPF_JNE,
+ [BPF_JSET >> 4] = BPF_JSET,
+ /* these swap "lesser" and "greater" (L and G in the opcodes) */
+ [BPF_JGE >> 4] = BPF_JLE,
+ [BPF_JGT >> 4] = BPF_JLT,
+ [BPF_JLE >> 4] = BPF_JGE,
+ [BPF_JLT >> 4] = BPF_JGT,
+ [BPF_JSGE >> 4] = BPF_JSLE,
+ [BPF_JSGT >> 4] = BPF_JSLT,
+ [BPF_JSLE >> 4] = BPF_JSGE,
+ [BPF_JSLT >> 4] = BPF_JSGT
+ };
+ return opcode_flip[opcode >> 4];
+}
-static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ u8 opcode)
{
- s64 sval = (s64)val;
+ struct bpf_reg_state *pkt;
+
+ if (src_reg->type == PTR_TO_PACKET_END) {
+ pkt = dst_reg;
+ } else if (dst_reg->type == PTR_TO_PACKET_END) {
+ pkt = src_reg;
+ opcode = flip_opcode(opcode);
+ } else {
+ return -1;
+ }
+
+ if (pkt->range >= 0)
+ return -1;
switch (opcode) {
- case BPF_JEQ:
- if (tnum_is_const(reg->var_off))
- return !!tnum_equals_const(reg->var_off, val);
- break;
- case BPF_JNE:
- if (tnum_is_const(reg->var_off))
- return !tnum_equals_const(reg->var_off, val);
- break;
- case BPF_JSET:
- if ((~reg->var_off.mask & reg->var_off.value) & val)
- return 1;
- if (!((reg->var_off.mask | reg->var_off.value) & val))
- return 0;
- break;
+ case BPF_JLE:
+ /* pkt <= pkt_end */
+ fallthrough;
case BPF_JGT:
- if (reg->umin_value > val)
- return 1;
- else if (reg->umax_value <= val)
- return 0;
- break;
- case BPF_JSGT:
- if (reg->smin_value > sval)
- return 1;
- else if (reg->smax_value < sval)
- return 0;
+ /* pkt > pkt_end */
+ if (pkt->range == BEYOND_PKT_END)
+ /* pkt has at last one extra byte beyond pkt_end */
+ return opcode == BPF_JGT;
break;
case BPF_JLT:
- if (reg->umax_value < val)
- return 1;
- else if (reg->umin_value >= val)
- return 0;
- break;
- case BPF_JSLT:
- if (reg->smax_value < sval)
- return 1;
- else if (reg->smin_value >= sval)
- return 0;
- break;
+ /* pkt < pkt_end */
+ fallthrough;
case BPF_JGE:
- if (reg->umin_value >= val)
- return 1;
- else if (reg->umax_value < val)
- return 0;
- break;
- case BPF_JSGE:
- if (reg->smin_value >= sval)
- return 1;
- else if (reg->smax_value < sval)
- return 0;
- break;
- case BPF_JLE:
- if (reg->umax_value <= val)
- return 1;
- else if (reg->umin_value > val)
- return 0;
- break;
- case BPF_JSLE:
- if (reg->smax_value <= sval)
- return 1;
- else if (reg->smin_value > sval)
- return 0;
+ /* pkt >= pkt_end */
+ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
+ return opcode == BPF_JGE;
break;
}
-
return -1;
}
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
* and return:
* 1 - branch will be taken and "goto target" will be executed
* 0 - branch will not be taken and fall-through to next insn
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
* range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
- bool is_jmp32)
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
{
- if (__is_pointer_value(false, reg)) {
- if (!reg_type_not_null(reg->type))
+ if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
+ return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
+
+ if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
+ u64 val;
+
+ /* arrange that reg2 is a scalar, and reg1 is a pointer */
+ if (!is_reg_const(reg2, is_jmp32)) {
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ }
+ /* and ensure that reg2 is a constant */
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+
+ if (!reg_not_null(reg1))
return -1;
/* If pointer is valid tests against zero will fail so we can
* use this to direct branch taken.
*/
+ val = reg_const_value(reg2, is_jmp32);
if (val != 0)
return -1;
@@ -6431,293 +14376,275 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
}
}
- if (is_jmp32)
- return is_branch32_taken(reg, val, opcode);
- return is_branch64_taken(reg, val, opcode);
+ /* now deal with two scalars, but not necessarily constants */
+ return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
}
-/* Adjusts the register min/max values in the case that the dst_reg is the
- * variable register that we are working on, and src_reg is a constant or we're
- * simply doing a BPF_K check.
- * In JEQ/JNE cases we also adjust the var_off values.
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
*/
-static void reg_set_min_max(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
-{
- struct tnum false_32off = tnum_subreg(false_reg->var_off);
- struct tnum false_64off = false_reg->var_off;
- struct tnum true_32off = tnum_subreg(true_reg->var_off);
- struct tnum true_64off = true_reg->var_off;
- s64 sval = (s64)val;
- s32 sval32 = (s32)val32;
-
- /* If the dst_reg is a pointer, we can't learn anything about its
- * variable offset from the compare (unless src_reg were a pointer into
- * the same object, but we don't bother with that.
- * Since false_reg and true_reg have the same type by construction, we
- * only need to check one of them for pointerness.
- */
- if (__is_pointer_value(false, false_reg))
- return;
+static u8 rev_opcode(u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ: return BPF_JNE;
+ case BPF_JNE: return BPF_JEQ;
+ /* JSET doesn't have it's reverse opcode in BPF, so add
+ * BPF_X flag to denote the reverse of that operation
+ */
+ case BPF_JSET: return BPF_JSET | BPF_X;
+ case BPF_JSET | BPF_X: return BPF_JSET;
+ case BPF_JGE: return BPF_JLT;
+ case BPF_JGT: return BPF_JLE;
+ case BPF_JLE: return BPF_JGT;
+ case BPF_JLT: return BPF_JGE;
+ case BPF_JSGE: return BPF_JSLT;
+ case BPF_JSGT: return BPF_JSLE;
+ case BPF_JSLE: return BPF_JSGT;
+ case BPF_JSLT: return BPF_JSGE;
+ default: return 0;
+ }
+}
+
+/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t;
+ u64 val;
+again:
switch (opcode) {
case BPF_JEQ:
+ if (is_jmp32) {
+ reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->u32_min_value = reg1->u32_min_value;
+ reg2->u32_max_value = reg1->u32_max_value;
+ reg2->s32_min_value = reg1->s32_min_value;
+ reg2->s32_max_value = reg1->s32_max_value;
+
+ t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ reg2->var_off = tnum_with_subreg(reg2->var_off, t);
+ } else {
+ reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->umin_value = reg1->umin_value;
+ reg2->umax_value = reg1->umax_value;
+ reg2->smin_value = reg1->smin_value;
+ reg2->smax_value = reg1->smax_value;
+
+ reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+ reg2->var_off = reg1->var_off;
+ }
+ break;
case BPF_JNE:
- {
- struct bpf_reg_state *reg =
- opcode == BPF_JEQ ? true_reg : false_reg;
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
- /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
- * if it is true we know the value for sure. Likewise for
- * BPF_JNE.
+ /* try to recompute the bound of reg1 if reg2 is a const and
+ * is exactly the edge of reg1.
*/
- if (is_jmp32)
- __mark_reg32_known(reg, val32);
- else
- __mark_reg_known(reg, val);
+ val = reg_const_value(reg2, is_jmp32);
+ if (is_jmp32) {
+ /* u32_min_value is not equal to 0xffffffff at this point,
+ * because otherwise u32_max_value is 0xffffffff as well,
+ * in such a case both reg1 and reg2 would be constants,
+ * jump would be predicted and reg_set_min_max() won't
+ * be called.
+ *
+ * Same reasoning works for all {u,s}{min,max}{32,64} cases
+ * below.
+ */
+ if (reg1->u32_min_value == (u32)val)
+ reg1->u32_min_value++;
+ if (reg1->u32_max_value == (u32)val)
+ reg1->u32_max_value--;
+ if (reg1->s32_min_value == (s32)val)
+ reg1->s32_min_value++;
+ if (reg1->s32_max_value == (s32)val)
+ reg1->s32_max_value--;
+ } else {
+ if (reg1->umin_value == (u64)val)
+ reg1->umin_value++;
+ if (reg1->umax_value == (u64)val)
+ reg1->umax_value--;
+ if (reg1->smin_value == (s64)val)
+ reg1->smin_value++;
+ if (reg1->smax_value == (s64)val)
+ reg1->smax_value--;
+ }
break;
- }
case BPF_JSET:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+ * requires single bit to learn something useful. E.g., if we
+ * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+ * are actually set? We can learn something definite only if
+ * it's a single-bit value to begin with.
+ *
+ * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+ * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+ * bit 1 is set, which we can readily use in adjustments.
+ */
+ if (!is_power_of_2(val))
+ break;
if (is_jmp32) {
- false_32off = tnum_and(false_32off, tnum_const(~val32));
- if (is_power_of_2(val32))
- true_32off = tnum_or(true_32off,
- tnum_const(val32));
+ t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- false_64off = tnum_and(false_64off, tnum_const(~val));
- if (is_power_of_2(val))
- true_64off = tnum_or(true_64off,
- tnum_const(val));
+ reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
}
break;
- case BPF_JGE:
- case BPF_JGT:
- {
+ case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
if (is_jmp32) {
- u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1;
- u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
-
- false_reg->u32_max_value = min(false_reg->u32_max_value,
- false_umax);
- true_reg->u32_min_value = max(true_reg->u32_min_value,
- true_umin);
+ t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- u64 false_umax = opcode == BPF_JGT ? val : val - 1;
- u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
-
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
}
break;
- }
- case BPF_JSGE:
- case BPF_JSGT:
- {
+ case BPF_JLE:
if (is_jmp32) {
- s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1;
- s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
-
- false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
- true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
} else {
- s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
- s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
-
- false_reg->smax_value = min(false_reg->smax_value, false_smax);
- true_reg->smin_value = max(true_reg->smin_value, true_smin);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
}
break;
- }
- case BPF_JLE:
case BPF_JLT:
- {
if (is_jmp32) {
- u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1;
- u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
-
- false_reg->u32_min_value = max(false_reg->u32_min_value,
- false_umin);
- true_reg->u32_max_value = min(true_reg->u32_max_value,
- true_umax);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+ reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
} else {
- u64 false_umin = opcode == BPF_JLT ? val : val + 1;
- u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
-
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+ reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
}
break;
- }
case BPF_JSLE:
+ if (is_jmp32) {
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ } else {
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+ }
+ break;
case BPF_JSLT:
- {
if (is_jmp32) {
- s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1;
- s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
-
- false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
- true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+ reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
} else {
- s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
- s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
-
- false_reg->smin_value = max(false_reg->smin_value, false_smin);
- true_reg->smax_value = min(true_reg->smax_value, true_smax);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+ reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- }
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ /* just reuse LE/LT logic above */
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ goto again;
default:
return;
}
-
- if (is_jmp32) {
- false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
- tnum_subreg(false_32off));
- true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
- tnum_subreg(true_32off));
- __reg_combine_32_into_64(false_reg);
- __reg_combine_32_into_64(true_reg);
- } else {
- false_reg->var_off = false_64off;
- true_reg->var_off = true_64off;
- __reg_combine_64_into_32(false_reg);
- __reg_combine_64_into_32(true_reg);
- }
}
-/* Same as above, but for the case that dst_reg holds a constant and src_reg is
- * the variable reg.
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
*/
-static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
+static int reg_set_min_max(struct bpf_verifier_env *env,
+ struct bpf_reg_state *true_reg1,
+ struct bpf_reg_state *true_reg2,
+ struct bpf_reg_state *false_reg1,
+ struct bpf_reg_state *false_reg2,
+ u8 opcode, bool is_jmp32)
{
- /* How can we transform "a <op> b" into "b <op> a"? */
- static const u8 opcode_flip[16] = {
- /* these stay the same */
- [BPF_JEQ >> 4] = BPF_JEQ,
- [BPF_JNE >> 4] = BPF_JNE,
- [BPF_JSET >> 4] = BPF_JSET,
- /* these swap "lesser" and "greater" (L and G in the opcodes) */
- [BPF_JGE >> 4] = BPF_JLE,
- [BPF_JGT >> 4] = BPF_JLT,
- [BPF_JLE >> 4] = BPF_JGE,
- [BPF_JLT >> 4] = BPF_JGT,
- [BPF_JSGE >> 4] = BPF_JSLE,
- [BPF_JSGT >> 4] = BPF_JSLT,
- [BPF_JSLE >> 4] = BPF_JSGE,
- [BPF_JSLT >> 4] = BPF_JSGT
- };
- opcode = opcode_flip[opcode >> 4];
- /* This uses zero as "not present in table"; luckily the zero opcode,
- * BPF_JA, can't get here.
- */
- if (opcode)
- reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
-}
-
-/* Regs are known to be equal, so intersect their min/max/var_off */
-static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
- struct bpf_reg_state *dst_reg)
-{
- src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
- dst_reg->umin_value);
- src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
- dst_reg->umax_value);
- src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
- dst_reg->smin_value);
- src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
- dst_reg->smax_value);
- src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
- dst_reg->var_off);
- /* We might have learned new bounds from the var_off. */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
- /* We might have learned something about the sign bit. */
- __reg_deduce_bounds(src_reg);
- __reg_deduce_bounds(dst_reg);
- /* We might have learned some bits from the bounds. */
- __reg_bound_offset(src_reg);
- __reg_bound_offset(dst_reg);
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
+ int err;
+
+ /* If either register is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless they were a pointer into
+ * the same object, but we don't bother with that).
*/
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
-}
+ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+ return 0;
-static void reg_combine_min_max(struct bpf_reg_state *true_src,
- struct bpf_reg_state *true_dst,
- struct bpf_reg_state *false_src,
- struct bpf_reg_state *false_dst,
- u8 opcode)
-{
- switch (opcode) {
- case BPF_JEQ:
- __reg_combine_min_max(true_src, true_dst);
- break;
- case BPF_JNE:
- __reg_combine_min_max(false_src, false_dst);
- break;
- }
+ /* fallthrough (FALSE) branch */
+ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(false_reg1);
+ reg_bounds_sync(false_reg2);
+
+ /* jump (TRUE) branch */
+ regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(true_reg1);
+ reg_bounds_sync(true_reg2);
+
+ err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ return err;
}
static void mark_ptr_or_null_reg(struct bpf_func_state *state,
struct bpf_reg_state *reg, u32 id,
bool is_null)
{
- if (reg_type_may_be_null(reg->type) && reg->id == id) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL.
+ if (type_may_be_null(reg->type) && reg->id == id &&
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
*/
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- __mark_reg_known_zero(reg);
- reg->off = 0;
- }
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
+ return;
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
+ return;
+
if (is_null) {
reg->type = SCALAR_VALUE;
- } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- const struct bpf_map *map = reg->map_ptr;
-
- if (map->inner_map_meta) {
- reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = map->inner_map_meta;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- reg->type = PTR_TO_XDP_SOCK;
- } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH) {
- reg->type = PTR_TO_SOCKET;
- } else {
- reg->type = PTR_TO_MAP_VALUE;
- }
- } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
- reg->type = PTR_TO_SOCKET;
- } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
- reg->type = PTR_TO_SOCK_COMMON;
- } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
- reg->type = PTR_TO_TCP_SOCK;
- } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
- reg->type = PTR_TO_BTF_ID;
- } else if (reg->type == PTR_TO_MEM_OR_NULL) {
- reg->type = PTR_TO_MEM;
- }
- if (is_null) {
/* We don't need id and ref_obj_id from this point
* onwards anymore, thus we should better reset it,
* so that state pruning has chances to take effect.
*/
reg->id = 0;
reg->ref_obj_id = 0;
- } else if (!reg_may_point_to_spin_lock(reg)) {
+
+ return;
+ }
+
+ mark_ptr_not_null_reg(reg);
+
+ if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -6727,22 +14654,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -6750,10 +14661,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -6762,8 +14672,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -6788,6 +14699,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
find_good_pkt_pointers(this_branch, dst_reg,
dst_reg->type, false);
+ mark_pkt_end(other_branch, insn->dst_reg, true);
} else if ((dst_reg->type == PTR_TO_PACKET_END &&
src_reg->type == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
@@ -6795,6 +14707,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_end > pkt_data', pkt_data > pkt_meta' */
find_good_pkt_pointers(other_branch, src_reg,
src_reg->type, true);
+ mark_pkt_end(this_branch, insn->src_reg, false);
} else {
return false;
}
@@ -6807,6 +14720,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
find_good_pkt_pointers(other_branch, dst_reg,
dst_reg->type, true);
+ mark_pkt_end(this_branch, insn->dst_reg, false);
} else if ((dst_reg->type == PTR_TO_PACKET_END &&
src_reg->type == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
@@ -6814,6 +14728,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_end < pkt_data', pkt_data > pkt_meta' */
find_good_pkt_pointers(this_branch, src_reg,
src_reg->type, false);
+ mark_pkt_end(other_branch, insn->src_reg, true);
} else {
return false;
}
@@ -6826,6 +14741,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
find_good_pkt_pointers(this_branch, dst_reg,
dst_reg->type, true);
+ mark_pkt_end(other_branch, insn->dst_reg, false);
} else if ((dst_reg->type == PTR_TO_PACKET_END &&
src_reg->type == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
@@ -6833,6 +14749,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
find_good_pkt_pointers(other_branch, src_reg,
src_reg->type, false);
+ mark_pkt_end(this_branch, insn->src_reg, true);
} else {
return false;
}
@@ -6845,6 +14762,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
find_good_pkt_pointers(other_branch, dst_reg,
dst_reg->type, false);
+ mark_pkt_end(this_branch, insn->dst_reg, true);
} else if ((dst_reg->type == PTR_TO_PACKET_END &&
src_reg->type == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
@@ -6852,6 +14770,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
find_good_pkt_pointers(this_branch, src_reg,
src_reg->type, true);
+ mark_pkt_end(other_branch, insn->src_reg, false);
} else {
return false;
}
@@ -6863,6 +14782,18 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
+static void find_equal_scalars(struct bpf_verifier_state *vstate,
+ struct bpf_reg_state *known_reg)
+{
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ copy_register_state(reg, known_reg);
+ }));
+}
+
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
@@ -6870,6 +14801,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
+ struct bpf_reg_state fake_reg = {};
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -6881,6 +14814,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -6892,62 +14831,63 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- if (is_pointer_value(env, insn->src_reg)) {
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
- src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
+ src_reg = &fake_reg;
+ src_reg->type = SCALAR_VALUE;
+ __mark_reg_known(src_reg, insn->imm);
}
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg = &regs[insn->dst_reg];
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
- pred = is_branch_taken(dst_reg,
- tnum_subreg(src_reg->var_off).value,
- opcode,
- is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- !is_jmp32 && tnum_is_const(src_reg->var_off)) {
- pred = is_branch_taken(dst_reg,
- src_reg->var_off.value,
- opcode,
- is_jmp32);
- }
-
+ pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
*/
if (!__is_pointer_value(false, dst_reg))
err = mark_chain_precision(env, insn->dst_reg);
- if (BPF_SRC(insn->code) == BPF_X && !err)
+ if (BPF_SRC(insn->code) == BPF_X && !err &&
+ !__is_pointer_value(false, src_reg))
err = mark_chain_precision(env, insn->src_reg);
if (err)
return err;
}
+
if (pred == 1) {
- /* only follow the goto, ignore fall-through */
+ /* Only follow the goto, ignore fall-through. If needed, push
+ * the fall-through branch for simulation under speculative
+ * execution.
+ */
+ if (!env->bypass_spec_v1 &&
+ !sanitize_speculative_path(env, insn, *insn_idx + 1,
+ *insn_idx))
+ return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
- /* only follow fall-through branch, since
- * that's where the program will go
+ /* Only follow the fall-through branch, since that's where the
+ * program will go. If needed, push the goto branch for
+ * simulation under speculative execution.
*/
+ if (!env->bypass_spec_v1 &&
+ !sanitize_speculative_path(env, insn,
+ *insn_idx + insn->off + 1,
+ *insn_idx))
+ return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -6957,45 +14897,68 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EFAULT;
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- /* detect if we are comparing against a constant value so we can adjust
- * our min/max values for our dst register.
- * this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
- * comparable.
- */
if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (dst_reg->type == SCALAR_VALUE &&
- src_reg->type == SCALAR_VALUE) {
- if (tnum_is_const(src_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(src_reg->var_off))))
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg,
- src_reg->var_off.value,
- tnum_subreg(src_reg->var_off).value,
- opcode, is_jmp32);
- else if (tnum_is_const(dst_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(dst_reg->var_off))))
- reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- src_reg,
- dst_reg->var_off.value,
- tnum_subreg(dst_reg->var_off).value,
- opcode, is_jmp32);
- else if (!is_jmp32 &&
- (opcode == BPF_JEQ || opcode == BPF_JNE))
- /* Comparing for equality, we can combine knowledge */
- reg_combine_min_max(&other_branch_regs[insn->src_reg],
- &other_branch_regs[insn->dst_reg],
- src_reg, dst_reg, opcode);
- }
- } else if (dst_reg->type == SCALAR_VALUE) {
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, (u32)insn->imm,
- opcode, is_jmp32);
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &other_branch_regs[insn->src_reg],
+ dst_reg, src_reg, opcode, is_jmp32);
+ } else /* BPF_SRC(insn->code) == BPF_K */ {
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ src_reg /* fake one */,
+ dst_reg, src_reg /* same fake one */,
+ opcode, is_jmp32);
+ }
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == SCALAR_VALUE && src_reg->id &&
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+ find_equal_scalars(this_branch, src_reg);
+ find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ }
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
+ !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
+ find_equal_scalars(this_branch, dst_reg);
+ find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ }
+
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
@@ -7004,7 +14967,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- reg_type_may_be_null(dst_reg->type)) {
+ type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -7020,7 +14983,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -7029,6 +14992,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *dst_reg;
struct bpf_map *map;
int err;
@@ -7045,25 +15009,69 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ dst_reg = &regs[insn->dst_reg];
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = SCALAR_VALUE;
+ dst_reg->type = SCALAR_VALUE;
__mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
- map = env->used_maps[aux->map_index];
+ /* All special src_reg cases are listed below. From this point onwards
+ * we either succeed and assign a corresponding dst_reg->type after
+ * zeroing the offset, or fail and reject the program.
+ */
mark_reg_known_zero(env, regs, insn->dst_reg);
- regs[insn->dst_reg].map_ptr = map;
-
- if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- regs[insn->dst_reg].off = aux->map_off;
- if (map_value_has_spin_lock(map))
- regs[insn->dst_reg].id = ++env->id_gen;
- } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
+ dst_reg->type = aux->btf_var.reg_type;
+ switch (base_type(dst_reg->type)) {
+ case PTR_TO_MEM:
+ dst_reg->mem_size = aux->btf_var.mem_size;
+ break;
+ case PTR_TO_BTF_ID:
+ dst_reg->btf = aux->btf_var.btf;
+ dst_reg->btf_id = aux->btf_var.btf_id;
+ break;
+ default:
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EFAULT;
+ }
+ return 0;
+ }
+
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = find_subprog(env,
+ env->insn_idx + insn->imm + 1);
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
+ return 0;
+ }
+
+ map = env->used_maps[aux->map_index];
+ dst_reg->map_ptr = map;
+
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ dst_reg->type = PTR_TO_MAP_VALUE;
+ dst_reg->off = aux->map_off;
+ WARN_ON_ONCE(map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX) {
+ dst_reg->type = CONST_PTR_TO_MAP;
} else {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@@ -7106,7 +15114,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
u8 mode = BPF_MODE(insn->code);
int i, err;
- if (!may_access_skb(env->prog->type)) {
+ if (!may_access_skb(resolve_prog_type(env->prog))) {
verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
@@ -7116,18 +15124,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (env->subprog_cnt > 1) {
- /* when program has LD_ABS insn JITs and interpreter assume
- * that r1 == ctx == skb which is not the case for callees
- * that can have arbitrary arguments. It's problematic
- * for main prog as well since JITs would need to analyze
- * all functions in order to make proper register save/restore
- * decisions in the main prog. Hence disallow LD_ABS with calls
- */
- verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
- return -EINVAL;
- }
-
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
@@ -7144,17 +15140,22 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
return err;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -7168,7 +15169,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
if (err < 0)
return err;
@@ -7188,48 +15189,87 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env)
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
{
+ const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
- struct tnum range = tnum_range(0, 1);
+ struct bpf_retval_range range = retval_range(0, 1);
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
- env->prog->type == BPF_PROG_TYPE_LSM) &&
- !prog->aux->attach_func_proto->type)
- return 0;
+ if (!is_subprog || frame->in_exception_callback_fn) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ fallthrough;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ }
- /* eBPF calling convetion is such that R0 is used
+ /* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, regno, SRC_OP);
if (err)
return err;
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
return -EACCES;
}
- switch (env->prog->type) {
+ reg = cur_regs(env) + regno;
+
+ if (frame->in_async_callback_fn) {
+ /* enforce return zero from async callbacks like timer */
+ exit_ctx = "At async callback return";
+ range = retval_range(0, 0);
+ goto enforce_retval;
+ }
+
+ if (is_subprog && !frame->in_exception_callback_fn) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+ regno, reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
- range = tnum_range(1, 1);
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
+ range = retval_range(1, 1);
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+ range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
break;
@@ -7242,13 +15282,13 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
return 0;
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
@@ -7259,6 +15299,28 @@ static int check_return_code(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ range = retval_range(SK_DROP, SK_PASS);
+ break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* Regular BPF_PROG_TYPE_LSM programs can return
+ * any value.
+ */
+ return 0;
+ }
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = retval_range(1, 1);
+ }
+ break;
+
+ case BPF_PROG_TYPE_NETFILTER:
+ range = retval_range(NF_DROP, NF_ACCEPT);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -7267,25 +15329,24 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
- reg = cur_regs(env) + BPF_REG_0;
+enforce_retval:
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ verbose(env, "%s the register R%d is not a known value (%s)\n",
+ exit_ctx, regno, reg_type_str(env, reg->type));
return -EINVAL;
}
- if (!tnum_in(range, reg->var_off)) {
- char tn_buf[48];
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
- verbose(env, "At program exit the register R0 ");
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
- }
- tnum_strn(tn_buf, sizeof(tn_buf), range);
- verbose(env, " should have been in %s\n", tn_buf);
+ if (!retval_range_within(range, reg)) {
+ verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+ if (!is_subprog &&
+ prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
@@ -7301,7 +15362,7 @@ static int check_return_code(struct bpf_verifier_env *env)
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -7335,42 +15396,56 @@ enum {
BRANCH = 2,
};
-static u32 state_htab_size(struct bpf_verifier_env *env)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
- return env->prog->len;
+ env->insn_aux_data[idx].prune_point = true;
}
-static struct bpf_verifier_state_list **explored_state(
- struct bpf_verifier_env *env,
- int idx)
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_verifier_state *cur = env->cur_state;
- struct bpf_func_state *state = cur->frame[cur->curframe];
+ return env->insn_aux_data[insn_idx].prune_point;
+}
- return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
{
- env->insn_aux_data[idx].prune_point = true;
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
}
+enum {
+ DONE_EXPLORING = 0,
+ KEEP_EXPLORING = 1,
+};
+
/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
- bool loop_ok)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
int *insn_stack = env->cfg.insn_stack;
int *insn_state = env->cfg.insn_state;
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
- return 0;
+ return DONE_EXPLORING;
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
- return 0;
+ return DONE_EXPLORING;
if (w < 0 || w >= env->prog->len) {
verbose_linfo(env, t, "%d: ", t);
@@ -7378,9 +15453,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -7389,10 +15466,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
insn_stack[env->cfg.cur_stack++] = w;
- return 1;
+ return KEEP_EXPLORING;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->bpf_capable)
- return 0;
+ if (env->bpf_capable)
+ return DONE_EXPLORING;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -7404,7 +15481,129 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
verbose(env, "insn state internal bug\n");
return -EFAULT;
}
- return 0;
+ return DONE_EXPLORING;
+}
+
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret, insn_sz;
+
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + insn_sz);
+
+ if (visit_callee) {
+ mark_prune_point(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ }
+ return ret;
+}
+
+/* Visits the instruction at index t and returns one of the following:
+ * < 0 - an error occurred
+ * DONE_EXPLORING - the instruction was fully explored
+ * KEEP_EXPLORING - there is still work to be done before it is fully explored
+ */
+static int visit_insn(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret, off, insn_sz;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+ if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+ if (BPF_SRC(insn->code) != BPF_K)
+ return -EINVAL;
+
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+ mark_prune_point(env, t);
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
+ }
}
/* non-recursive depth-first-search to detect loops in BPF program
@@ -7412,11 +15611,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
*/
static int check_cfg(struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
- int ret = 0;
- int i, t;
+ int ex_insn_beg, i, ret = 0;
+ bool ex_done = false;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@@ -7432,98 +15630,59 @@ static int check_cfg(struct bpf_verifier_env *env)
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
-peek_stack:
- if (env->cfg.cur_stack == 0)
- goto check_state;
- t = insn_stack[env->cfg.cur_stack - 1];
-
- if (BPF_CLASS(insns[t].code) == BPF_JMP ||
- BPF_CLASS(insns[t].code) == BPF_JMP32) {
- u8 opcode = BPF_OP(insns[t].code);
+walk_cfg:
+ while (env->cfg.cur_stack > 0) {
+ int t = insn_stack[env->cfg.cur_stack - 1];
- if (opcode == BPF_EXIT) {
- goto mark_explored;
- } else if (opcode == BPF_CALL) {
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
- if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- }
- } else if (opcode == BPF_JA) {
- if (BPF_SRC(insns[t].code) != BPF_K) {
- ret = -EINVAL;
- goto err_free;
+ ret = visit_insn(t, env);
+ switch (ret) {
+ case DONE_EXPLORING:
+ insn_state[t] = EXPLORED;
+ env->cfg.cur_stack--;
+ break;
+ case KEEP_EXPLORING:
+ break;
+ default:
+ if (ret > 0) {
+ verbose(env, "visit_insn internal bug\n");
+ ret = -EFAULT;
}
- /* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1,
- FALLTHROUGH, env, true);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
- } else {
- /* conditional jump with two edges */
- init_explored_state(env, t);
- ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
-
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- }
- } else {
- /* all other non-branch instructions with single
- * fall-through edge
- */
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
goto err_free;
+ }
}
-mark_explored:
- insn_state[t] = EXPLORED;
- if (env->cfg.cur_stack-- <= 0) {
+ if (env->cfg.cur_stack < 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
- goto peek_stack;
-check_state:
+ if (env->exception_callback_subprog && !ex_done) {
+ ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
+
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ ex_done = true;
+ goto walk_cfg;
+ }
+
for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ i++; /* skip second half of ldimm64 */
+ }
}
ret = 0; /* cfg looks good */
@@ -7534,32 +15693,46 @@ err_free:
return ret;
}
+static int check_abnormal_return(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ if (env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
/* The minimum supported BTF func info size */
#define MIN_BPF_FUNCINFO_SIZE 8
#define MAX_FUNCINFO_REC_SIZE 252
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
- u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
- const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *urecord;
u32 prev_offset = 0;
+ bpfptr_t urecord;
int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
- if (!nfuncs)
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
return 0;
-
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
}
urec_size = attr->func_info_rec_size;
@@ -7573,15 +15746,12 @@ static int check_btf_func(struct bpf_verifier_env *env,
prog = env->prog;
btf = prog->aux->btf;
- urecord = u64_to_user_ptr(attr->func_info);
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
min_size = min_t(u32, krec_size, urec_size);
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
- info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
- if (!info_aux)
- goto err_free;
for (i = 0; i < nfuncs; i++) {
ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -7591,37 +15761,32 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* set the size kernel expects so loader can zero
* out the rest of the record.
*/
- if (put_user(min_size, &uattr->func_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, func_info_rec_size),
+ &min_size, sizeof(min_size)))
ret = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&krecord[i], urecord, min_size)) {
+ if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
ret = -EFAULT;
goto err_free;
}
/* check insn_off */
+ ret = -EINVAL;
if (i == 0) {
if (krecord[i].insn_off) {
verbose(env,
"nonzero insn_off %u for the first func info record",
krecord[i].insn_off);
- ret = -EINVAL;
goto err_free;
}
} else if (krecord[i].insn_off <= prev_offset) {
verbose(env,
"same or smaller insn offset (%u) than previous func info record (%u)",
krecord[i].insn_off, prev_offset);
- ret = -EINVAL;
- goto err_free;
- }
-
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- ret = -EINVAL;
goto err_free;
}
@@ -7630,21 +15795,98 @@ static int check_btf_func(struct bpf_verifier_env *env,
if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
- ret = -EINVAL;
goto err_free;
}
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+
+ func_proto = btf_type_by_id(btf, type->type);
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
+ /* btf_func_check() already verified it during BTF load */
+ goto err_free;
+
prev_offset = krecord[i].insn_off;
- urecord += urec_size;
+ bpfptr_add(&urecord, urec_size);
}
prog->aux->func_info = krecord;
prog->aux->func_info_cnt = nfuncs;
- prog->aux->func_info_aux = info_aux;
return 0;
err_free:
kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ goto err_free;
+ }
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ scalar_return =
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info_aux = info_aux;
+ return 0;
+
+err_free:
kfree(info_aux);
return ret;
}
@@ -7657,29 +15899,31 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
if (!aux->func_info)
return;
- for (i = 0; i < env->subprog_cnt; i++)
+ /* func_info is not available for hidden subprogs */
+ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
struct bpf_subprog_info *sub;
struct bpf_line_info *linfo;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *ulinfo;
+ bpfptr_t ulinfo;
int err;
nr_linfo = attr->line_info_cnt;
if (!nr_linfo)
return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
rec_size = attr->line_info_rec_size;
if (rec_size < MIN_BPF_LINEINFO_SIZE ||
@@ -7700,7 +15944,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
s = 0;
sub = env->subprog_info;
- ulinfo = u64_to_user_ptr(attr->line_info);
+ ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
expected_size = sizeof(struct bpf_line_info);
ncopy = min_t(u32, expected_size, rec_size);
for (i = 0; i < nr_linfo; i++) {
@@ -7708,14 +15952,15 @@ static int check_btf_line(struct bpf_verifier_env *env,
if (err) {
if (err == -E2BIG) {
verbose(env, "nonzero tailing record in line_info");
- if (put_user(expected_size,
- &uattr->line_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, line_info_rec_size),
+ &expected_size, sizeof(expected_size)))
err = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
err = -EFAULT;
goto err_free;
}
@@ -7767,7 +16012,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
}
prev_offset = linfo[i].insn_off;
- ulinfo += rec_size;
+ bpfptr_add(&ulinfo, rec_size);
}
if (s != env->subprog_cnt) {
@@ -7787,21 +16032,118 @@ err_free:
return err;
}
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
+static int check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
- if (!attr->func_info_cnt && !attr->line_info_cnt)
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
return 0;
+ }
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
env->prog->aux->btf = btf;
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
err = check_btf_func(env, attr, uattr);
if (err)
return err;
@@ -7810,6 +16152,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
if (err)
return err;
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
return 0;
}
@@ -7820,16 +16166,13 @@ static bool range_within(struct bpf_reg_state *old,
return old->umin_value <= cur->umin_value &&
old->umax_value >= cur->umax_value &&
old->smin_value <= cur->smin_value &&
- old->smax_value >= cur->smax_value;
+ old->smax_value >= cur->smax_value &&
+ old->u32_min_value <= cur->u32_min_value &&
+ old->u32_max_value >= cur->u32_max_value &&
+ old->s32_min_value <= cur->s32_min_value &&
+ old->s32_max_value >= cur->s32_max_value;
}
-/* Maximum number of register states that can exist at once */
-#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
-struct idpair {
- u32 old;
- u32 cur;
-};
-
/* If in the old state two registers had the same id, then they need to have
* the same id in the new state as well. But that id could be different from
* the old state, so we need to track the mapping from old to new ids.
@@ -7840,25 +16183,47 @@ struct idpair {
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
+ struct bpf_id_pair *map = idmap->map;
unsigned int i;
- for (i = 0; i < ID_MAP_SIZE; i++) {
- if (!idmap[i].old) {
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
+ for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
+ if (!map[i].old) {
/* Reached an empty slot; haven't seen this id before */
- idmap[i].old = old_id;
- idmap[i].cur = cur_id;
+ map[i].old = old_id;
+ map[i].cur = cur_id;
return true;
}
- if (idmap[i].old == old_id)
- return idmap[i].cur == cur_id;
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
}
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
+/* Similar to check_ids(), but allocate a unique temporary ID
+ * for 'old_id' or 'cur_id' of zero.
+ * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ return check_ids(old_id, cur_id, idmap);
+}
+
static void clean_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *st)
{
@@ -7921,7 +16286,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
* we can call this clean_live_states() function to mark all liveness states
* as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -7937,98 +16302,122 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
struct bpf_verifier_state_list *sl;
- int i;
sl = *explored_state(env, insn);
while (sl) {
if (sl->state.branches)
goto next;
if (sl->state.insn_idx != insn ||
- sl->state.curframe != cur->curframe)
+ !same_callsites(&sl->state, cur))
goto next;
- for (i = 0; i <= cur->curframe; i++)
- if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
- goto next;
clean_verifier_state(env, &sl->state);
next:
sl = sl->next;
}
}
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct idpair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
{
- bool equal;
+ if (exact)
+ return regs_exact(rold, rcur, idmap);
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
-
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
-
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
-
- if (equal)
- return true;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
if (rcur->type == NOT_INIT)
return false;
- switch (rold->type) {
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
+ switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
- return true;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- } else {
- /* We're trying to use a pointer in place of a scalar.
- * Even if the scalar was unbounded, this could lead to
- * pointer leaks because scalars are allowed to leak
- * while pointers are not. We could make this safe in
- * special cases if root is calling us, but it's
- * probably not worth the hassle.
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
*/
- return false;
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
}
+ if (!rold->precise)
+ return true;
+ /* Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * 'id' is not compared, since it's only used for maps with
- * bpf_spin_lock inside map element and in such cases if
- * the rest of the prog is valid for one map element then
- * it's valid for all map elements regardless of the key
- * used in bpf_map_lookup()
*/
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_VALUE_OR_NULL:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != rold->type)
- return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
* still safe. This is true even if old range < old off,
@@ -8043,38 +16432,23 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
if (rold->off != rcur->off)
return false;
/* id relations must be preserved */
- if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ if (!check_ids(rold->id, rcur->id, idmap))
return false;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
default:
- /* Don't know what's going on, just say it's not safe */
- return false;
+ return regs_exact(rold, rcur, idmap);
}
-
- /* Shouldn't get here; if we do, say it's not safe */
- WARN_ON_ONCE(1);
- return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct idpair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
{
int i, spi;
@@ -8083,9 +16457,16 @@ static bool stacksafe(struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+ if (exact &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ return false;
+
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
@@ -8094,6 +16475,10 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
/* explored stack has more populated slots than current stack
* and these slots were used
*/
@@ -8110,18 +16495,16 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
/* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
+ * this stack slot, but current has STACK_MISC ->
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
return false;
- if (i % BPF_REG_SIZE)
- continue;
- if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -8132,17 +16515,60 @@ static bool stacksafe(struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap, exact))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+ struct bpf_idmap *idmap)
{
+ int i;
+
if (old->acquired_refs != cur->acquired_refs)
return false;
- return !memcmp(old->refs, cur->refs,
- sizeof(*old->refs) * old->acquired_refs);
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ return false;
+ }
+
+ return true;
}
/* compare two verifier states
@@ -8171,50 +16597,66 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool func_states_equal(struct bpf_func_state *old,
- struct bpf_func_state *cur)
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, bool exact)
{
- struct idpair *idmap;
- bool ret = false;
int i;
- idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
- /* If we failed to allocate the idmap, just say it's not safe */
- if (!idmap)
+ if (old->callback_depth > cur->callback_depth)
return false;
- for (i = 0; i < MAX_BPF_REG; i++) {
- if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
- goto out_free;
- }
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ &env->idmap_scratch, exact))
+ return false;
- if (!stacksafe(old, cur, idmap))
- goto out_free;
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
+ return false;
- if (!refsafe(old, cur))
- goto out_free;
- ret = true;
-out_free:
- kfree(idmap);
- return ret;
+ if (!refsafe(old, cur, &env->idmap_scratch))
+ return false;
+
+ return true;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
}
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
+ struct bpf_verifier_state *cur,
+ bool exact)
{
int i;
if (old->curframe != cur->curframe)
return false;
+ reset_idmap_scratch(env);
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->active_lock.ptr != cur->active_lock.ptr)
+ return false;
+
+ /* Old and cur active_lock's have to be either both present
+ * or both absent.
+ */
+ if (!!old->active_lock.id != !!cur->active_lock.id)
+ return false;
+
+ if (old->active_lock.id &&
+ !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
/* for states to be equal callsites have to be the same
@@ -8223,7 +16665,7 @@ static bool states_equal(struct bpf_verifier_env *env,
for (i = 0; i <= old->curframe; i++) {
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
return false;
}
return true;
@@ -8317,35 +16759,54 @@ static int propagate_precision(struct bpf_verifier_env *env,
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
- int i, err = 0;
+ int i, err = 0, fr;
+ bool first;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ first = true;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
+ }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
+ }
+ if (!first)
+ verbose(env, "\n");
}
+
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
return 0;
}
@@ -8367,21 +16828,102 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl, **pprev;
- struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
-
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
+ struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+ int i, j, n, err, states_cnt = 0;
+ bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+ bool add_new_state = force_new_state;
+ bool force_exact;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -8404,11 +16946,101 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
+
if (sl->state.branches) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ *
+ * Note, that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+ * not have read or precision mark for r7 yet, thus inexact states
+ * comparison would discard current state with r7=-32
+ * => unsafe memory access at 11 would not be caught.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, true)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ update_loop_entry(cur, &sl->state);
+ goto hit;
+ }
+ }
+ goto skip_inf_loop_check;
+ }
+ if (calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, true))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ states_equal(env, &sl->state, cur, false) &&
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur->frame[cur->curframe], true);
+ verbose(env, "old state:");
+ print_verifier_state(env, sl->state.frame[cur->curframe], true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -8423,12 +17055,44 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
- if (states_equal(env, &sl->state, cur)) {
+ /* If sl->state is a part of a loop and this loop's entry is a part of
+ * current verification path then states have to be compared exactly.
+ * 'force_exact' is needed to catch the following case:
+ *
+ * initial Here state 'succ' was processed first,
+ * | it was eventually tracked to produce a
+ * V state identical to 'hdr'.
+ * .---------> hdr All branches from 'succ' had been explored
+ * | | and thus 'succ' has its .branches == 0.
+ * | V
+ * | .------... Suppose states 'cur' and 'succ' correspond
+ * | | | to the same instruction + callsites.
+ * | V V In such case it is necessary to check
+ * | ... ... if 'succ' and 'cur' are states_equal().
+ * | | | If 'succ' and 'cur' are a part of the
+ * | V V same loop exact flag has to be set.
+ * | succ <- cur To check if that is the case, verify
+ * | | if loop entry of 'succ' is in current
+ * | V DFS path.
+ * | ...
+ * | |
+ * '----'
+ *
+ * Additional details are in the comment before get_loop_entry().
+ */
+ loop_entry = get_loop_entry(&sl->state);
+ force_exact = loop_entry && loop_entry->branches > 0;
+ if (states_equal(env, &sl->state, cur, force_exact)) {
+ if (force_exact)
+ update_loop_entry(cur, loop_entry);
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
@@ -8447,7 +17111,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* the precision needs to be propagated back in
* the current state.
*/
- err = err ? : push_jmp_history(env, cur);
+ if (is_jmp_point(env, env->insn_idx))
+ err = err ? : push_jmp_history(env, cur, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -8466,13 +17131,18 @@ miss:
* to keep checking from state equivalence point of view.
* Higher numbers increase max_states_per_insn and verification time,
* but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times state could miss before eviction.
+ * Use bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
*/
- if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
*pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+ !sl->state.used_as_loop_entry) {
u32 br = sl->state.branches;
WARN_ONCE(br,
@@ -8501,10 +17171,10 @@ next:
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -8523,6 +17193,10 @@ next:
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -8537,6 +17211,7 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
@@ -8577,17 +17252,13 @@ next:
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_CTX:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -8612,6 +17283,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_missmatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_missmatch &&
+ base_type(type) == PTR_TO_BTF_ID &&
+ base_type(*prev_type) == PTR_TO_BTF_ID) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
+ */
+ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -8623,10 +17332,14 @@ static int do_check(struct bpf_verifier_env *env)
int prev_insn_idx = -1;
for (;;) {
+ bool exception_exit = false;
struct bpf_insn *insn;
u8 class;
int err;
+ /* reset current history entry on each new instruction */
+ env->cur_hist_ent = NULL;
+
env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
@@ -8644,21 +17357,31 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state, 0);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -8667,31 +17390,34 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level & BPF_LOG_LEVEL2 ||
- (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%d:", env->insn_idx);
- else
- verbose(env, "\nfrom %d to %d%s:",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe]);
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state->frame[state->curframe], true);
do_print_state = false;
}
if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state->frame[state->curframe]);
+
verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -8699,7 +17425,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+ sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -8708,7 +17434,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type *prev_src_type, src_reg_type;
+ enum bpf_reg_type src_reg_type;
/* check for reserved fields is already done */
@@ -8728,42 +17454,28 @@ static int do_check(struct bpf_verifier_env *env)
*/
err = check_mem_access(env, env->insn_idx, insn->src_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false);
+ BPF_READ, insn->dst_reg, false,
+ BPF_MODE(insn->code) == BPF_MEMSX);
+ err = err ?: save_aux_ptr_type(env, src_reg_type, true);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
if (err)
return err;
-
- prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_src_type == NOT_INIT) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * save type to validate intersecting paths
- */
- *prev_src_type = src_reg_type;
-
- } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
} else if (class == BPF_STX) {
- enum bpf_reg_type *prev_dst_type, dst_reg_type;
+ enum bpf_reg_type dst_reg_type;
- if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, env->insn_idx, insn);
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ err = check_atomic(env, env->insn_idx, insn);
if (err)
return err;
env->insn_idx++;
continue;
}
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -8778,20 +17490,16 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false);
+ BPF_WRITE, insn->src_reg, false, false);
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_dst_type == NOT_INIT) {
- *prev_dst_type = dst_reg_type;
- } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_ST uses reserved fields\n");
@@ -8802,59 +17510,73 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- if (is_ctx_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
- insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
- return -EACCES;
- }
+ dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->off != 0 ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
+ && insn->off != 0) ||
(insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
+ if (env->cur_state->active_lock.ptr) {
+ if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_CALL) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
}
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
err = check_func_call(env, insn, &env->insn_idx);
- else
- err = check_helper_call(env, insn->imm, env->insn_idx);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ err = check_kfunc_call(env, insn, &env->insn_idx);
+ if (!err && is_bpf_throw_kfunc(insn)) {
+ exception_exit = true;
+ goto process_bpf_exit_full;
+ }
+ } else {
+ err = check_helper_call(env, insn, &env->insn_idx);
+ }
if (err)
return err;
+ mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
- env->insn_idx += insn->off + 1;
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
continue;
} else if (opcode == BPF_EXIT) {
@@ -8866,12 +17588,42 @@ static int do_check(struct bpf_verifier_env *env)
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
-
- if (env->cur_state->active_spin_lock) {
+process_bpf_exit_full:
+ if (env->cur_state->active_lock.ptr &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock &&
+ !in_rbtree_lock_required_cb(env)) {
+ verbose(env, "bpf_rcu_read_unlock is missing\n");
+ return -EINVAL;
+ }
+
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env, exception_exit);
+ if (err)
+ return err;
+
+ /* The side effect of the prepare_func_exit
+ * which is being skipped is that it frees
+ * bpf_func_state. Typically, process_bpf_exit
+ * will only be hit with outermost exit.
+ * copy_verifier_state in pop_stack will handle
+ * freeing of any extra bpf_func_state left over
+ * from not processing all nested function
+ * exits. We also skip return code checks as
+ * they are not needed for exceptional exits.
+ */
+ if (exception_exit)
+ goto process_bpf_exit;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -8881,14 +17633,11 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
- err = check_return_code(env);
+ err = check_return_code(env, BPF_REG_0, "R0");
if (err)
return err;
process_bpf_exit:
+ mark_verifier_state_scratched(env);
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
@@ -8919,7 +17668,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+ sanitize_mark_insn_seen(env);
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -8935,12 +17684,170 @@ process_bpf_exit:
return 0;
}
-static int check_map_prealloc(struct bpf_map *map)
+static int find_btf_percpu_datasec(struct btf *btf)
{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
+ const struct btf_type *t;
+ const char *tname;
+ int i, n;
+
+ /*
+ * Both vmlinux and module each have their own ".data..percpu"
+ * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF
+ * types to look at only module's own BTF types.
+ */
+ n = btf_nr_types(btf);
+ if (btf_is_module(btf))
+ i = btf_nr_types(btf_vmlinux);
+ else
+ i = 1;
+
+ for(; i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, ".data..percpu"))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+/* replace pseudo btf_id with kernel symbol address */
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *datasec;
+ struct btf_mod_pair *btf_mod;
+ const struct btf_type *t;
+ const char *sym_name;
+ bool percpu = false;
+ u32 type, id = insn->imm;
+ struct btf *btf;
+ s32 datasec_id;
+ u64 addr;
+ int i, btf_fd, err;
+
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
+ btf_get(btf);
+ }
+
+ t = btf_type_by_id(btf, id);
+ if (!t) {
+ verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
+ err = -ENOENT;
+ goto err_put;
+ }
+
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ sym_name = btf_name_by_offset(btf, t->name_off);
+ addr = kallsyms_lookup_name(sym_name);
+ if (!addr) {
+ verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
+ sym_name);
+ err = -ENOENT;
+ goto err_put;
+ }
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ goto check_btf;
+ }
+
+ datasec_id = find_btf_percpu_datasec(btf);
+ if (datasec_id > 0) {
+ datasec = btf_type_by_id(btf, datasec_id);
+ for_each_vsi(i, datasec, vsi) {
+ if (vsi->type == id) {
+ percpu = true;
+ break;
+ }
+ }
+ }
+
+ type = t->type;
+ t = btf_type_skip_modifiers(btf, type, NULL);
+ if (percpu) {
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
+ aux->btf_var.btf = btf;
+ aux->btf_var.btf_id = type;
+ } else if (!btf_type_is_struct(t)) {
+ const struct btf_type *ret;
+ const char *tname;
+ u32 tsize;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(btf, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ err = -EINVAL;
+ goto err_put;
+ }
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = tsize;
+ } else {
+ aux->btf_var.reg_type = PTR_TO_BTF_ID;
+ aux->btf_var.btf = btf;
+ aux->btf_var.btf_id = type;
+ }
+check_btf:
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++) {
+ if (env->used_btfs[i].btf == btf) {
+ btf_put(btf);
+ return 0;
+ }
+ }
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ err = -E2BIG;
+ goto err_put;
+ }
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ err = -ENXIO;
+ goto err_put;
+ }
+ }
+
+ env->used_btf_cnt++;
+
+ return 0;
+err_put:
+ btf_put(btf);
+ return err;
}
static bool is_tracing_prog_type(enum bpf_prog_type type)
@@ -8950,64 +17857,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
return true;
default:
return false;
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) {
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
return -EINVAL;
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
+ }
+
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
- if ((is_tracing_prog_type(prog->type) ||
- prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
- map_value_has_spin_lock(map)) {
- verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
- return -EINVAL;
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ return -EINVAL;
+ }
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -9018,6 +17909,29 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (prog->aux->sleepable)
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ break;
+ default:
+ verbose(env,
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -9027,10 +17941,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
-/* look for pseudo eBPF instructions that access map FDs and
- * replace them with actual map pointers
+/* find and rewrite pseudo imm in ld_imm64 instructions:
+ *
+ * 1. if it accesses map FD, replace it with actual map pointer.
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
+ *
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
-static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -9042,23 +17960,18 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0)) {
verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
- if (BPF_CLASS(insn->code) == BPF_STX &&
- ((BPF_MODE(insn->code) != BPF_MEM &&
- BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
-
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
struct fd f;
u64 addr;
+ u32 fd;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -9071,19 +17984,55 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* valid generic load 64-bit imm */
goto next_insn;
+ if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
+ aux = &env->insn_aux_data[i];
+ err = check_pseudo_btf_id(env, insn, aux);
+ if (err)
+ return err;
+ goto next_insn;
+ }
+
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
- if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
- insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
- (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
- insn[1].imm != 0)) {
- verbose(env,
- "unrecognized bpf_ld_imm64 insn\n");
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_VALUE:
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ break;
+ case BPF_PSEUDO_MAP_FD:
+ case BPF_PSEUDO_MAP_IDX:
+ if (insn[1].imm == 0)
+ break;
+ fallthrough;
+ default:
+ verbose(env, "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
- f = fdget(insn[0].imm);
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ case BPF_PSEUDO_MAP_IDX:
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "fd_idx without fd_array is invalid\n");
+ return -EPROTO;
+ }
+ if (copy_from_bpfptr_offset(&fd, env->fd_array,
+ insn[0].imm * sizeof(fd),
+ sizeof(fd)))
+ return -EFAULT;
+ break;
+ default:
+ fd = insn[0].imm;
+ break;
+ }
+
+ f = fdget(fd);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
@@ -9098,7 +18047,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
}
aux = &env->insn_aux_data[i];
- if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
+ insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
} else {
u32 off = insn[1].imm;
@@ -9144,10 +18094,12 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return -E2BIG;
}
+ if (env->prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_used_maps()
+ * and all maps are released in bpf_free_used_maps()
*/
bpf_map_inc(map);
@@ -9189,6 +18141,13 @@ static void release_maps(struct bpf_verifier_env *env)
env->used_map_cnt);
}
+/* drop refcnt of maps used by the rejected program */
+static void release_btfs(struct bpf_verifier_env *env)
+{
+ __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
+ env->used_btf_cnt);
+}
+
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
@@ -9196,20 +18155,26 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
}
/* single env->prog->insni[off] instruction was replaced with the range
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_insn_aux_data *new_data,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
+ u32 old_seen = old_data[off].seen;
u32 prog_len;
int i;
@@ -9220,22 +18185,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
- return 0;
+ return;
prog_len = new_prog->len;
- new_data = vzalloc(array_size(prog_len,
- sizeof(struct bpf_insn_aux_data)));
- if (!new_data)
- return -ENOMEM;
+
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = env->pass_cnt;
+ /* Expand insni[off]'s seen count to the patched range. */
+ new_data[i].seen = old_seen;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
vfree(old_data);
- return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -9252,10 +18214,32 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ int i, sz = prog->aux->size_poke_tab;
+ struct bpf_jit_poke_descriptor *desc;
+
+ for (i = 0; i < sz; i++) {
+ desc = &tab[i];
+ if (desc->insn_idx <= off)
+ continue;
+ desc->insn_idx += len - 1;
+ }
+}
+
static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)));
+ if (!new_data)
+ return NULL;
+ }
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@@ -9263,11 +18247,12 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
+ vfree(new_data);
return NULL;
}
- if (adjust_insn_aux_data(env, new_prog, off, len))
- return NULL;
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
+ adjust_poke_descs(new_prog, off, len);
return new_prog;
}
@@ -9397,7 +18382,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -9441,6 +18426,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
@@ -9448,13 +18434,13 @@ static bool insn_is_cond_jump(u8 code)
{
u8 op;
+ op = BPF_OP(code);
if (BPF_CLASS(code) == BPF_JMP32)
- return true;
+ return op != BPF_JA;
if (BPF_CLASS(code) != BPF_JMP)
return false;
- op = BPF_OP(code);
return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
}
@@ -9477,7 +18463,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -9547,8 +18533,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
for (i = 0; i < len; i++) {
int adj_idx = i + delta;
struct bpf_insn insn;
+ int load_reg;
insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
if (!aux[adj_idx].zext_dst) {
u8 code, class;
u32 imm_rnd;
@@ -9558,14 +18546,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
code = insn.code;
class = BPF_CLASS(code);
- if (insn_no_def(&insn))
+ if (load_reg == -1)
continue;
/* NOTE: arg "reg" (the fourth one) is only used for
- * BPF_STX which has been ruled out in above
- * check, it is safe to pass NULL here.
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
*/
- if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -9577,21 +18565,40 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
- rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ rnd_hi32_patch[3].dst_reg = load_reg;
patch = rnd_hi32_patch;
patch_len = 4;
goto apply_patch_buffer;
}
- if (!bpf_jit_needs_zext())
+ /* Add in an zero-extend instruction if a) the JIT has requested
+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
+ continue;
+
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
continue;
+ if (WARN_ON(load_reg == -1)) {
+ verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ return -EFAULT;
+ }
+
zext_patch[0] = insn;
- zext_patch[1].dst_reg = insn.dst_reg;
- zext_patch[1].src_reg = insn.dst_reg;
+ zext_patch[1].dst_reg = load_reg;
+ zext_patch[1].src_reg = load_reg;
patch = zext_patch;
patch_len = 2;
apply_patch_buffer:
@@ -9643,42 +18650,41 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ u8 mode;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -9692,7 +18698,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
- switch (env->insn_aux_data[i + delta].ptr_type) {
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
continue;
@@ -9709,13 +18715,22 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
default:
@@ -9724,6 +18739,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
/* If the read access is a narrower load of the field,
* convert to a 4/8-byte load, to minimum program type specific
@@ -9763,6 +18779,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
+ return -EINVAL;
+ }
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -9775,10 +18795,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
insn->dst_reg,
shift);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1ULL << size * 8) - 1);
}
}
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -9798,6 +18822,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog, **func, *tmp;
int i, j, subprog_start, subprog_end = 0, len, subprog;
+ struct bpf_map *map_ptr;
struct bpf_insn *insn;
void *old_bpf_func;
int err, num_exentries;
@@ -9806,9 +18831,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
continue;
+
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
* propagated in any case.
@@ -9829,6 +18854,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
+ if (bpf_pseudo_func(insn))
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Force a non zero here.
+ */
+ insn[1].imm = 1;
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -9845,10 +18876,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- /* BPF_PROG_RUN doesn't call subprogs directly,
+ /* bpf_prog_run() doesn't call subprogs directly,
* hence main prog stats include the runtime of subprogs.
* subprogs don't have IDs and not reachable via prog_get_next_id
- * func[i]->aux->stats will never be accessed and stays NULL
+ * func[i]->stats will never be accessed and stays NULL
*/
func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
@@ -9861,16 +18892,28 @@ static int jit_subprogs(struct bpf_verifier_env *env)
goto out_free;
func[i]->is_func = 1;
func[i]->aux->func_idx = i;
- /* the btf and func_info will be freed only at prog->aux */
+ /* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
+ func[i]->aux->poke_tab = prog->aux->poke_tab;
+ func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
+ struct bpf_jit_poke_descriptor *poke;
+
+ poke = &prog->aux->poke_tab[j];
+ if (poke->insn_idx < subprog_end &&
+ poke->insn_idx >= subprog_start)
+ poke->aux = func[i]->aux;
+ }
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
- * Long term would need debug info to populate names
- */
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+ func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -9879,10 +18922,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM)
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -9890,6 +18938,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
cond_resched();
}
+
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
@@ -9897,12 +18946,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn->off;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
+ if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
- insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
}
/* we use the aux data to keep a list of the start addresses
@@ -9917,7 +18970,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
@@ -9931,9 +18985,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
/* finally lock prog and jit images for all functions and
- * populate kallsysm
+ * populate kallsysm. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
*/
- for (i = 0; i < env->subprog_cnt; i++) {
+ for (i = 1; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]);
bpf_prog_kallsyms_add(func[i]);
}
@@ -9943,8 +18998,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = insn->off;
+ insn->off = 0;
+ continue;
+ }
+ if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
subprog = find_subprog(env, i + insn->off + 1);
@@ -9953,26 +19013,47 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt;
- bpf_prog_free_unused_jited_linfo(prog);
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+ bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
- for (i = 0; i < env->subprog_cnt; i++)
- if (func[i])
- bpf_jit_free(func[i]);
+ /* We failed JIT'ing, so at this point we need to unregister poke
+ * descriptors from subprogs, so that kernel is not attempting to
+ * patch it anymore as we're freeing the subprog JIT memory.
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+ /* At this point we're guaranteed that poke descriptors are not
+ * live anymore. We can just unlink its descriptor table as it's
+ * released with the main prog.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (!func[i])
+ continue;
+ func[i]->aux->poke_tab = NULL;
+ bpf_jit_free(func[i]);
+ }
kfree(func);
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
- bpf_prog_free_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return err;
}
@@ -9981,12 +19062,13 @@ static int fixup_call_args(struct bpf_verifier_env *env)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
int i, depth;
#endif
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -9994,9 +19076,27 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
for (i = 0; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
+ if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);
if (depth < 0)
@@ -10008,15 +19108,189 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-/* fixup insn->imm field of bpf_call instructions
- * and inline eligible helpers as explicit sequence of BPF instructions
- *
- * this function is called after eBPF program passed verification
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc) {
+ *addr = (unsigned long)xdp_kfunc;
+ return;
+ }
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
+ if (offset)
+ return;
+
+ if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ }
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
+ */
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
+ if (!desc) {
+ verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+ insn->imm);
+ return -EFAULT;
+ }
+
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ if (!kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
+ return 0;
+}
+
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verbose(env, "verifier internal error: only one hidden subprog supported\n");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
+ return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
*/
-static int fixup_bpf_calls(struct bpf_verifier_env *env)
+static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@@ -10027,36 +19301,55 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0;
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn patch[] = {
+ env->prog->insnsi[insn_cnt - 1],
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ };
+
+ ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
for (i = 0; i < insn_cnt; i++, insn++) {
+ /* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- struct bpf_insn mask_and_div[] = {
- BPF_MOV32_REG(insn->src_reg, insn->src_reg),
- /* Rx div 0 -> 0 */
- BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patchlet;
+ struct bpf_insn chk_and_div[] = {
+ /* [R,W]x div 0 -> 0 */
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0),
BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
*insn,
};
- struct bpf_insn mask_and_mod[] = {
- BPF_MOV32_REG(insn->src_reg, insn->src_reg),
- /* Rx mod 0 -> Rx */
- BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+ struct bpf_insn chk_and_mod[] = {
+ /* [R,W]x mod 0 -> [R,W]x */
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0),
*insn,
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
};
- struct bpf_insn *patchlet;
- if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
- insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
- patchlet = mask_and_div + (is64 ? 1 : 0);
- cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
- } else {
- patchlet = mask_and_mod + (is64 ? 1 : 0);
- cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
- }
+ patchlet = isdiv ? chk_and_div : chk_and_mod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
@@ -10068,6 +19361,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
@@ -10087,13 +19381,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn insn_buf[16];
struct bpf_insn *patch = &insn_buf[0];
- bool issrc, isneg;
+ bool issrc, isneg, isimm;
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
@@ -10104,28 +19398,29 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
BPF_ALU_SANITIZE_SRC;
+ isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
off_reg = issrc ? insn->src_reg : insn->dst_reg;
- if (isneg)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
- *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
- *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
- if (issrc) {
- *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
- off_reg);
- insn->src_reg = BPF_REG_AX;
+ if (isimm) {
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
} else {
- *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
- BPF_REG_AX);
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
}
+ if (!issrc)
+ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+ insn->src_reg = BPF_REG_AX;
if (isneg)
insn->code = insn->code == code_add ?
code_sub : code_add;
*patch++ = *insn;
- if (issrc && isneg)
+ if (issrc && isneg && !isimm)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
cnt = patch - insn_buf;
@@ -10143,6 +19438,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+ if (ret)
+ return ret;
+ if (cnt == 0)
+ continue;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
@@ -10157,11 +19468,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
- env->prog->aux->stack_depth = MAX_BPF_STACK;
- env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
+ if (!allow_tail_call_in_subprogs(env))
+ prog->aux->stack_depth = MAX_BPF_STACK;
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpeter for every normal
+ * conditional branch in the interpreter for every normal
* call and to prevent accidental JITing by JIT compiler
* that doesn't support bpf_tail_call yet
*/
@@ -10169,7 +19481,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -10178,6 +19490,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
.reason = BPF_POKE_REASON_TAIL_CALL,
.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
.tail_call.key = bpf_map_key_immediate(aux),
+ .insn_idx = i + delta,
};
ret = bpf_jit_add_poke_descriptor(prog, &desc);
@@ -10223,6 +19536,77 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ if (is_storage_get_function(insn->imm)) {
+ if (!env->prog->aux->sleepable ||
+ env->insn_aux_data[i + delta].storage_get_func_atomic)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -10233,7 +19617,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_delete_elem ||
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem)) {
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -10243,7 +19630,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt == -EOPNOTSUPP)
+ goto patch_map_ops_generic;
+ if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -10262,48 +19651,62 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (long (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
+patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
- insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
continue;
case BPF_FUNC_map_update_elem:
- insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
continue;
case BPF_FUNC_map_delete_elem:
- insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
continue;
case BPF_FUNC_map_push_elem:
- insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
continue;
case BPF_FUNC_map_pop_elem:
- insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
continue;
case BPF_FUNC_map_peek_elem:
- insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
+ continue;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ continue;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
+ continue;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
continue;
}
goto patch_call_imm;
}
+ /* Implement bpf_jiffies64 inline. */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_jiffies64) {
struct bpf_insn ld_jiffies_addr[2] = {
@@ -10328,6 +19731,89 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -10359,6 +19845,144 @@ patch_call_imm:
}
}
+ sort_kfunc_descs_by_imm_off(env->prog);
+
+ return 0;
+}
+
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+ struct bpf_insn insn_buf[] = {
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+ BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+ /* spill R6, R7, R8 to use these as loop vars */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+ /* initialize loop vars */
+ BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+ BPF_MOV32_IMM(reg_loop_cnt, 0),
+ BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+ BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+ BPF_CALL_REL(0),
+ /* increment loop counter */
+ BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+ /* jump to loop header if callback returned 0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+ /* restore original values of R6, R7, R8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+ };
+
+ *cnt = ARRAY_SIZE(insn_buf);
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
return 0;
}
@@ -10392,37 +20016,10 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -10446,34 +20043,74 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
+
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
- ret = btf_prepare_func_args(env, subprog, regs);
+ const char *sub_name = subprog_name(env, subprog);
+ struct bpf_subprog_arg_info *arg;
+ struct bpf_reg_state *reg;
+
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ ret = btf_prepare_func_args(env, subprog);
if (ret)
goto out;
- for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
- if (regs[i].type == PTR_TO_CTX)
+
+ if (subprog_is_exc_cb(env, subprog)) {
+ state->frame[0]->in_exception_callback_fn = true;
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
+ verbose(env, "exception cb only supports single integer argument\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+ arg = &sub->args[i - BPF_REG_1];
+ reg = &regs[i];
+
+ if (arg->arg_type == ARG_PTR_TO_CTX) {
+ reg->type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, i);
- else if (regs[i].type == SCALAR_VALUE)
+ } else if (arg->arg_type == ARG_ANYTHING) {
+ reg->type = SCALAR_VALUE;
mark_reg_unknown(env, regs, i);
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ /* assume unspecial LOCAL dynptr type */
+ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ reg->type = PTR_TO_MEM;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
+ mark_reg_known_zero(env, regs, i);
+ reg->mem_size = arg->mem_size;
+ reg->id = ++env->id_gen;
+ } else {
+ WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
+ i - BPF_REG_1, arg->arg_type);
+ ret = -EFAULT;
+ goto out;
+ }
}
} else {
+ /* if main BPF program has associated BTF info, validate that
+ * it's matching expected signature, and otherwise mark BTF
+ * info for main program as unreliable
+ */
+ if (env->prog->aux->func_info_aux) {
+ ret = btf_prepare_func_args(env, 0);
+ if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+ env->prog->aux->func_info_aux[0].unreliable = true;
+ }
+
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_func_arg_match(env, subprog, regs);
- if (ret == -EFAULT)
- /* unlikely verifier bug. abort.
- * ret == 0 and ret < 0 are sadly acceptable for
- * main() function due to backward compatibility.
- * Like socket filter program may be written as:
- * int bpf_prog(struct pt_regs *ctx)
- * and never dereference that ctx in the program.
- * 'struct pt_regs' is a type mismatch for socket
- * filter that should be using 'struct __sk_buff'.
- */
- goto out;
}
ret = do_check(env);
@@ -10489,14 +20126,14 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
* Consider:
* int bar(int);
* int foo(int f)
@@ -10515,25 +20152,50 @@ out:
static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
- int i, ret;
+ struct bpf_func_info_aux *sub_aux;
+ int i, ret, new_cnt;
if (!aux->func_info)
return 0;
+ /* exception callback is presumed to be always called */
+ if (env->exception_callback_subprog)
+ subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+ new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ if (!subprog_is_global(env, i))
+ continue;
+
+ sub_aux = subprog_aux(env, i);
+ if (!sub_aux->called || sub_aux->verified)
continue;
+
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
- verbose(env,
- "Func#%d is safe for any args that match its prototype\n",
- i);
+ verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+ i, subprog_name(env, i));
}
+
+ /* We verified new global subprog, it might have called some
+ * more global subprogs that we haven't verified yet, so we
+ * need to do another pass over subprogs to verify those.
+ */
+ sub_aux->verified = true;
+ new_cnt++;
}
+
+ /* We can't loop forever as we verify at least one global subprog on
+ * each pass.
+ */
+ if (new_cnt)
+ goto again;
+
return 0;
}
@@ -10582,6 +20244,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
u32 btf_id, member_idx;
const char *mname;
+ if (!prog->gpl_compatible) {
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
btf_id = prog->aux->attach_btf_id;
st_ops = bpf_struct_ops_find(btf_id);
if (!st_ops) {
@@ -10609,7 +20276,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ int err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -10626,98 +20293,127 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
#define SECURITY_PREFIX "security_"
-static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
{
if (within_error_injection_list(addr) ||
- !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name,
- sizeof(SECURITY_PREFIX) - 1))
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
return 0;
return -EINVAL;
}
-static int check_attach_btf_id(struct bpf_verifier_env *env)
+/* list of non-sleepable functions that are otherwise on
+ * ALLOW_ERROR_INJECTION list
+ */
+BTF_SET_START(btf_non_sleepable_error_inject)
+/* Three functions below can be called from sleepable and non-sleepable context.
+ * Assume non-sleepable from bpf safety point of view.
+ */
+BTF_ID(func, __filemap_add_folio)
+BTF_ID(func, should_fail_alloc_page)
+BTF_ID(func, should_failslab)
+BTF_SET_END(btf_non_sleepable_error_inject)
+
+static int check_non_sleepable_error_inject(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
+}
+
+int bpf_check_attach_target(struct bpf_verifier_log *log,
+ const struct bpf_prog *prog,
+ const struct bpf_prog *tgt_prog,
+ u32 btf_id,
+ struct bpf_attach_target_info *tgt_info)
{
- struct bpf_prog *prog = env->prog;
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
- u32 btf_id = prog->aux->attach_btf_id;
+ bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
const char prefix[] = "btf_trace_";
- struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
- struct bpf_trampoline *tr;
const struct btf_type *t;
bool conservative = true;
const char *tname;
struct btf *btf;
- long addr;
- u64 key;
-
- if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
- return check_struct_ops_btf_id(env);
-
- if (prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM &&
- !prog_extension)
- return 0;
+ long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
- verbose(env, "Tracing programs must provide btf_id\n");
+ bpf_log(log, "Tracing programs must provide btf_id\n");
return -EINVAL;
}
- btf = bpf_prog_get_target_btf(prog);
+ btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
if (!btf) {
- verbose(env,
+ bpf_log(log,
"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
return -EINVAL;
}
t = btf_type_by_id(btf, btf_id);
if (!t) {
- verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+ bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
return -EINVAL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
- verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
+ bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
return -EINVAL;
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
break;
}
if (subprog == -1) {
- verbose(env, "Subprog %s doesn't exist\n", tname);
+ bpf_log(log, "Subprog %s doesn't exist\n", tname);
+ return -EINVAL;
+ }
+ if (aux->func && aux->func[subprog]->aux->exception_cb) {
+ bpf_log(log,
+ "%s programs cannot attach to exception callback\n",
+ prog_extension ? "Extension" : "FENTRY/FEXIT");
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
if (prog_extension) {
if (conservative) {
- verbose(env,
+ bpf_log(log,
"Cannot replace static functions\n");
return -EINVAL;
}
if (!prog->jit_requested) {
- verbose(env,
+ bpf_log(log,
"Extension programs should be JITed\n");
return -EINVAL;
}
- env->ops = bpf_verifier_ops[tgt_prog->type];
- prog->expected_attach_type = tgt_prog->expected_attach_type;
}
if (!tgt_prog->jited) {
- verbose(env, "Can attach to only JITed progs\n");
+ bpf_log(log, "Can attach to only JITed progs\n");
return -EINVAL;
}
- if (tgt_prog->type == prog->type) {
- /* Cannot fentry/fexit another fentry/fexit program.
- * Cannot attach program extension to another extension.
- * It's ok to attach fentry/fexit to extension program.
+ if (prog_tracing) {
+ if (aux->attach_tracing_prog) {
+ /*
+ * Target program is an fentry/fexit which is already attached
+ * to another tracing program. More levels of nesting
+ * attachment are not allowed.
+ */
+ bpf_log(log, "Cannot nest tracing program attach more than once\n");
+ return -EINVAL;
+ }
+ } else if (tgt_prog->type == prog->type) {
+ /*
+ * To avoid potential call chain cycles, prevent attaching of a
+ * program extension to another extension. It's ok to attach
+ * fentry/fexit to extension program.
*/
- verbose(env, "Cannot recursively attach\n");
+ bpf_log(log, "Cannot recursively attach\n");
return -EINVAL;
}
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
@@ -10728,43 +20424,40 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
* analysis, stats and can be attached to any program
- * type except themselves. When extension program is
- * replacing XDP function it is necessary to allow
- * performance analysis of all functions. Both original
- * XDP program and its program extension. Hence
- * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
- * allowed. If extending of fentry/fexit was allowed it
- * would be possible to create long call chain
- * fentry->extension->fentry->extension beyond
- * reasonable stack size. Hence extending fentry is not
- * allowed.
+ * type. When extension program is replacing XDP function
+ * it is necessary to allow performance analysis of all
+ * functions. Both original XDP program and its program
+ * extension. Hence attaching fentry/fexit to
+ * BPF_PROG_TYPE_EXT is allowed. If extending of
+ * fentry/fexit was allowed it would be possible to create
+ * long call chain fentry->extension->fentry->extension
+ * beyond reasonable stack size. Hence extending fentry
+ * is not allowed.
*/
- verbose(env, "Cannot extend fentry/fexit\n");
+ bpf_log(log, "Cannot extend fentry/fexit\n");
return -EINVAL;
}
- key = ((u64)aux->id) << 32 | btf_id;
} else {
if (prog_extension) {
- verbose(env, "Cannot replace kernel functions\n");
+ bpf_log(log, "Cannot replace kernel functions\n");
return -EINVAL;
}
- key = btf_id;
}
switch (prog->expected_attach_type) {
case BPF_TRACE_RAW_TP:
if (tgt_prog) {
- verbose(env,
+ bpf_log(log,
"Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
return -EINVAL;
}
if (!btf_type_is_typedef(t)) {
- verbose(env, "attach_btf_id %u is not a typedef\n",
+ bpf_log(log, "attach_btf_id %u is not a typedef\n",
btf_id);
return -EINVAL;
}
if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
- verbose(env, "attach_btf_id %u points to wrong type name %s\n",
+ bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
btf_id, tname);
return -EINVAL;
}
@@ -10778,115 +20471,278 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* should never happen in valid vmlinux build */
return -EINVAL;
- /* remember two read only pointers that are valid for
- * the life time of the kernel
- */
- prog->aux->attach_func_name = tname;
- prog->aux->attach_func_proto = t;
- prog->aux->attach_btf_trace = true;
- return 0;
+ break;
case BPF_TRACE_ITER:
if (!btf_type_is_func(t)) {
- verbose(env, "attach_btf_id %u is not a function\n",
+ bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
return -EINVAL;
}
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
- prog->aux->attach_func_name = tname;
- prog->aux->attach_func_proto = t;
- if (!bpf_iter_prog_supported(prog))
- return -EINVAL;
- ret = btf_distill_func_proto(&env->log, btf, t,
- tname, &fmodel);
- return ret;
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
+ if (ret)
+ return ret;
+ break;
default:
if (!prog_extension)
return -EINVAL;
- /* fallthrough */
+ fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- prog->aux->attach_func_name = tname;
- if (prog->type == BPF_PROG_TYPE_LSM) {
- ret = bpf_lsm_verify_prog(&env->log, prog);
- if (ret < 0)
- return ret;
- }
-
if (!btf_type_is_func(t)) {
- verbose(env, "attach_btf_id %u is not a function\n",
+ bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
return -EINVAL;
}
if (prog_extension &&
- btf_check_type_match(env, prog, btf, t))
+ btf_check_type_match(log, prog, btf, t))
return -EINVAL;
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
- tr = bpf_trampoline_lookup(key);
- if (!tr)
- return -ENOMEM;
- /* t is either vmlinux type or another program's type */
- prog->aux->attach_func_proto = t;
- mutex_lock(&tr->mutex);
- if (tr->func.addr) {
- prog->aux->trampoline = tr;
- goto out;
- }
- if (tgt_prog && conservative) {
- prog->aux->attach_func_proto = NULL;
+
+ if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
+ (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
+ prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
+ return -EINVAL;
+
+ if (tgt_prog && conservative)
t = NULL;
- }
- ret = btf_distill_func_proto(&env->log, btf, t,
- tname, &tr->func.model);
+
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
if (ret < 0)
- goto out;
+ return ret;
+
if (tgt_prog) {
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
- verbose(env,
+ module_put(mod);
+ bpf_log(log,
"The address of function %s cannot be found\n",
tname);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
}
- if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
- ret = check_attach_modify_return(prog, addr);
- if (ret)
- verbose(env, "%s() is not modifiable\n",
- prog->aux->attach_func_name);
+ if (prog->aux->sleepable) {
+ ret = -EINVAL;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
+ prog);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
+ * Only some of them are sleepable.
+ */
+ if (bpf_lsm_is_sleepable_hook(btf_id))
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+ if (ret) {
+ module_put(mod);
+ bpf_log(log, "%s is not sleepable\n", tname);
+ return ret;
+ }
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ if (tgt_prog) {
+ module_put(mod);
+ bpf_log(log, "can't modify return codes of BPF programs\n");
+ return -EINVAL;
+ }
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
+ if (ret) {
+ module_put(mod);
+ bpf_log(log, "%s() is not modifiable\n", tname);
+ return ret;
+ }
}
- if (ret)
- goto out;
- tr->func.addr = (void *)addr;
- prog->aux->trampoline = tr;
-out:
- mutex_unlock(&tr->mutex);
- if (ret)
- bpf_trampoline_put(tr);
+ break;
+ }
+ tgt_info->tgt_addr = addr;
+ tgt_info->tgt_name = tname;
+ tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
+ return 0;
+}
+
+BTF_SET_START(btf_id_deny)
+BTF_ID_UNUSED
+#ifdef CONFIG_SMP
+BTF_ID(func, migrate_disable)
+BTF_ID(func, migrate_enable)
+#endif
+#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
+BTF_ID(func, rcu_read_unlock_strict)
+#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
+BTF_SET_END(btf_id_deny)
+
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
+ struct bpf_attach_target_info tgt_info = {};
+ u32 btf_id = prog->aux->attach_btf_id;
+ struct bpf_trampoline *tr;
+ int ret;
+ u64 key;
+
+ if (prog->type == BPF_PROG_TYPE_SYSCALL) {
+ if (prog->aux->sleepable)
+ /* attach_btf_id checked to be zero already */
+ return 0;
+ verbose(env, "Syscall programs can only be sleepable\n");
+ return -EINVAL;
+ }
+
+ if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ return -EINVAL;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM &&
+ prog->type != BPF_PROG_TYPE_EXT)
+ return 0;
+
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
+ if (ret)
return ret;
+
+ if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
+ /* to make freplace equivalent to their targets, they need to
+ * inherit env->ops and expected_attach_type for the rest of the
+ * verification
+ */
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ prog->expected_attach_type = tgt_prog->expected_attach_type;
+ }
+
+ /* store info about the attachment target that will be used later */
+ prog->aux->attach_func_proto = tgt_info.tgt_type;
+ prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
+
+ if (tgt_prog) {
+ prog->aux->saved_dst_prog_type = tgt_prog->type;
+ prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
+ }
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ prog->aux->attach_btf_trace = true;
+ return 0;
+ } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
+ if (!bpf_iter_prog_supported(prog))
+ return -EINVAL;
+ return 0;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_LSM) {
+ ret = bpf_lsm_verify_prog(&env->log, prog);
+ if (ret < 0)
+ return ret;
+ } else if (prog->type == BPF_PROG_TYPE_TRACING &&
+ btf_id_set_contains(&btf_id_deny, btf_id)) {
+ return -EINVAL;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
+ prog->aux->dst_trampoline = tr;
+ return 0;
+}
+
+struct btf *bpf_get_btf_vmlinux(void)
+{
+ if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ mutex_lock(&bpf_verifier_lock);
+ if (!btf_vmlinux)
+ btf_vmlinux = btf_parse_vmlinux();
+ mutex_unlock(&bpf_verifier_lock);
}
+ return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
- union bpf_attr __user *uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
/* no program is valid */
@@ -10899,7 +20755,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
- log = &env->log;
+
+ env->bt.env = env;
len = (*prog)->len;
env->insn_aux_data =
@@ -10911,33 +20768,25 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
is_priv = bpf_capable();
- if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- mutex_lock(&bpf_verifier_lock);
- if (!btf_vmlinux)
- btf_vmlinux = btf_parse_vmlinux();
- mutex_unlock(&bpf_verifier_lock);
- }
+ bpf_get_btf_vmlinux();
/* grab the mutex to protect few globals used by verifier */
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
- ret = -EINVAL;
- /* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
- !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
- goto err_unlock;
- }
+ mark_verifier_state_clean(env);
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
@@ -10953,22 +20802,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->strict_alignment = false;
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->allow_uninit_stack = bpf_allow_uninit_stack();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
-
- ret = replace_map_fd_with_map_ptr(env);
- if (ret < 0)
- goto skip_full_check;
-
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
- ret = bpf_prog_offload_verifier_prep(env->prog);
- if (ret)
- goto skip_full_check;
- }
+ env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
@@ -10977,6 +20818,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
+ ret = check_btf_info_early(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = add_subprog_and_kfunc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -10989,14 +20838,24 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret)
goto skip_full_check;
+ ret = resolve_pseudo_ldimm64(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
+ ret = bpf_prog_offload_verifier_prep(env->prog);
+ if (ret)
+ goto skip_full_check;
+ }
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check_subprogs(env);
- ret = ret ?: do_check_main(env);
+ ret = do_check_main(env);
+ ret = ret ?: do_check_subprogs(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -11006,6 +20865,9 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
@@ -11023,12 +20885,12 @@ skip_full_check:
ret = convert_ctx_accesses(env);
if (ret == 0)
- ret = fixup_bpf_calls(env);
+ ret = do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
@@ -11039,15 +20901,24 @@ skip_full_check:
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
+ env->prog->aux->verified_insns = env->insn_processed;
+
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}
- if (ret == 0 && env->used_map_cnt) {
+ if (ret)
+ goto err_release_maps;
+
+ if (env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
@@ -11061,15 +20932,29 @@ skip_full_check:
memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;
+ }
+ if (env->used_btf_cnt) {
+ /* if program passed verifier, update used_btfs in bpf_prog_aux */
+ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
+ sizeof(env->used_btfs[0]),
+ GFP_KERNEL);
+ if (!env->prog->aux->used_btfs) {
+ ret = -ENOMEM;
+ goto err_release_maps;
+ }
+ memcpy(env->prog->aux->used_btfs, env->used_btfs,
+ sizeof(env->used_btfs[0]) * env->used_btf_cnt);
+ env->prog->aux->used_btf_cnt = env->used_btf_cnt;
+ }
+ if (env->used_map_cnt || env->used_btf_cnt) {
/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}
- if (ret == 0)
- adjust_btf_func(env);
+ adjust_btf_func(env);
err_release_maps:
if (!env->prog->aux->used_maps)
@@ -11077,6 +20962,8 @@ err_release_maps:
* them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
+ if (!env->prog->aux->used_btfs)
+ release_btfs(env);
/* extension progs temporarily inherit the attach_type of their targets
for verification purposes, so set it back to zero before returning
diff --git a/kernel/capability.c b/kernel/capability.c
index 1444f3954d75..dac4df77e376 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -93,7 +86,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /* fall through - v3 is otherwise equivalent to v2. */
+ fallthrough; /* v3 is otherwise equivalent to v2 */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
@@ -119,7 +112,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
int ret;
if (pid && (pid != task_pid_vnr(current))) {
- struct task_struct *target;
+ const struct task_struct *target;
rcu_read_lock();
@@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
@@ -360,6 +346,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
@@ -418,7 +405,7 @@ EXPORT_SYMBOL(ns_capable_noaudit);
/**
* ns_capable_setid - Determine if the current task has a superior capability
* in effect, while signalling that this check is being done from within a
- * setid syscall.
+ * setid or setgroups syscall.
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
@@ -480,18 +467,22 @@ EXPORT_SYMBOL(file_ns_capable);
/**
* privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
* @ns: The user namespace in question
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
*
* Return true if the inode uid and gid are within the namespace.
*/
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+ struct mnt_idmap *idmap,
+ const struct inode *inode)
{
- return kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
* @cap: The capability in question
*
@@ -499,11 +490,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
+ const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+ return ns_capable(ns, cap) &&
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..08caad776717
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) error handling.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+
+#include <linux/cfi.h>
+
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
+{
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
+ else
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
+
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
+ }
+
+ return BUG_TRAP_TYPE_BUG;
+}
+
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
+{
+ return (unsigned long)((long)p + (long)*p);
+}
+
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
+{
+ s32 *p;
+
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
+{
+ char *secstrings;
+ unsigned int i;
+
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
+
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
+
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
+}
+
+static bool is_module_cfi_trap(unsigned long addr)
+{
+ struct module *mod;
+ bool found = false;
+
+ rcu_read_lock_sched_notrace();
+
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
+
+ rcu_read_unlock_sched_notrace();
+
+ return found;
+}
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
+{
+ return false;
+}
+#endif /* CONFIG_MODULES */
+
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
+
+bool is_cfi_trap(unsigned long addr)
+{
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
+
+ return is_module_cfi_trap(addr);
+}
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..12f8457ad1f9 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);
/*
@@ -65,6 +64,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
@@ -146,15 +164,13 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
-extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
/* iterate across the hierarchies */
#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
/**
* for_each_subsys - iterate all enabled cgroup subsystems
@@ -204,8 +220,6 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
-bool cgroup_is_thread_root(struct cgroup *cgrp);
-bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -215,6 +229,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -231,6 +246,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 191c329e482a..520a11cb12f4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -50,20 +50,19 @@ bool cgroup1_ssid_disabled(int ssid)
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
struct cgroup_root *root;
int retval = 0;
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_lock();
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
- if (root == &cgrp_dfl_root)
- continue;
-
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@@ -72,15 +71,15 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_attach_unlock(true);
+ cgroup_unlock();
return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
@@ -89,6 +88,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
@@ -105,9 +106,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -143,8 +144,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_attach_unlock(true);
+ cgroup_unlock();
return ret;
}
@@ -359,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
css_task_iter_end(&it);
length = n;
- /* now sort & (if procs) strip out duplicates */
+ /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
@@ -393,6 +393,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -402,25 +403,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -430,7 +430,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
if (l->list[mid] == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (l->list[mid] < pid)
index = mid + 1;
else
end = mid;
@@ -448,7 +448,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -459,7 +460,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -503,10 +505,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -542,14 +545,24 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ strscpy(cgrp->root->release_agent_path, strstrip(buf),
sizeof(cgrp->root->release_agent_path));
spin_unlock(&release_agent_path_lock);
cgroup_kn_unlock(of->kn);
@@ -658,11 +671,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
*/
- mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -670,7 +681,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
- mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -682,6 +692,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
@@ -695,8 +707,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
- mutex_lock(&cgroup_mutex);
-
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@@ -704,16 +714,15 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
+ if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
+ switch (READ_ONCE(tsk->__state)) {
case TASK_RUNNING:
stats->nr_running++;
break;
@@ -727,14 +736,14 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
stats->nr_stopped++;
break;
default:
- if (delayacct_is_task_waiting_on_io(tsk))
+ if (tsk->in_iowait)
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
- mutex_unlock(&cgroup_mutex);
+ cgroup_put(cgrp);
return 0;
}
@@ -787,13 +796,13 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
spin_lock(&release_agent_path_lock);
- strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+ strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
spin_unlock(&release_agent_path_lock);
if (!agentbuf[0])
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- if (ret < 0 || ret >= PATH_MAX)
+ if (ret < 0)
goto out_free;
argv[0] = agentbuf;
@@ -820,6 +829,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
struct cgroup *cgrp = kn->priv;
int ret;
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(new_name_str, '\n'))
+ return -EINVAL;
+
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
if (kn->parent != new_parent)
@@ -833,13 +846,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
kernfs_break_active_protection(new_parent);
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
kernfs_unbreak_active_protection(new_parent);
@@ -861,6 +874,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -884,6 +899,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -895,6 +912,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -907,14 +926,17 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
- if (strcmp(param->key, "source") == 0) {
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ int ret;
+
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
+ if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
+ return invalfc(fc, "Disabled controller '%s'",
+ param->key);
ctx->subsys_mask |= (1 << i);
return 0;
}
@@ -943,10 +965,22 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -996,7 +1030,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
ctx->subsys_mask &= enabled;
/*
- * In absense of 'none', 'name=' or subsystem name options,
+ * In absence of 'none', 'name=' and subsystem name options,
* let's default to 'all'.
*/
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
@@ -1084,7 +1118,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
trace_cgroup_remount(root);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1188,8 +1222,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
@@ -1208,15 +1245,13 @@ int cgroup1_get_tree(struct fs_context *fc)
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (!ret)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
@@ -1227,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
return ret;
}
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only*/
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
static int __init cgroup1_wq_init(void)
{
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dd247747ec14..a66c088c851c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -56,6 +57,7 @@
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -68,6 +70,14 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -87,7 +97,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -197,9 +207,11 @@ static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = REFCOUNT_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -208,6 +220,23 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
+
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
@@ -222,6 +251,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
@@ -232,7 +267,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
bool cgroup_ssid_enabled(int ssid)
{
- if (CGROUP_SUBSYS_COUNT == 0)
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -244,7 +279,7 @@ bool cgroup_ssid_enabled(int ssid)
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
+ * cases where a subsystem should behave differently depending on the
* interface version.
*
* List of changed behaviors:
@@ -254,15 +289,13 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
- * recycled inbetween reads.
+ * recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
@@ -281,12 +314,7 @@ bool cgroup_ssid_enabled(int ssid)
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- * is not created.
- *
* - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
@@ -329,7 +357,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp)
return cgrp->nr_populated_csets;
}
-bool cgroup_is_threaded(struct cgroup *cgrp)
+static bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
}
@@ -345,7 +373,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp)
return !cgroup_parent(cgrp);
}
-/* can @cgrp become a thread root? should always be true for a thread root */
+/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
@@ -368,7 +396,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
}
/* is @cgrp root of a threaded subtree? */
-bool cgroup_is_thread_root(struct cgroup *cgrp)
+static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
/* thread root should be a domain */
if (cgroup_is_threaded(cgrp))
@@ -459,7 +487,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
- if (ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
@@ -467,28 +495,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
- css = cgroup_css(cgrp, ss);
- if (css && !css_tryget_online(css))
- css = NULL;
- rcu_read_unlock();
-
- return css;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -530,13 +536,16 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
- * callers responsiblity to tryget a reference for it.
+ * callers responsibility to try get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
do {
css = cgroup_css(cgrp, ss);
@@ -564,6 +573,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
rcu_read_lock();
do {
@@ -580,11 +592,12 @@ out_unlock:
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
+ cgroup_get(cgrp);
}
/**
@@ -633,7 +646,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
- if (cft->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
@@ -646,7 +659,7 @@ EXPORT_SYMBOL_GPL(of_css);
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -656,21 +669,6 @@ EXPORT_SYMBOL_GPL(of_css);
else
/**
- * for_each_e_css - iterate all effective css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css_by_mask(cgrp, \
- cgroup_subsys[(ssid)]))) \
- ; \
- else
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -681,7 +679,7 @@ EXPORT_SYMBOL_GPL(of_css);
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
- if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
@@ -702,7 +700,7 @@ EXPORT_SYMBOL_GPL(of_css);
; \
else
-/* walk live descendants in preorder */
+/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
@@ -736,7 +734,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -910,7 +909,7 @@ static void css_set_move_task(struct task_struct *task,
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
@@ -936,7 +935,7 @@ void put_css_set_locked(struct css_set *cset)
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
- /* This css_set is dead. unlink it and release cgroup and css refs */
+ /* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
@@ -1051,7 +1050,7 @@ static bool compare_css_sets(struct css_set *cset,
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+ struct cgroup_subsys_state **template)
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
@@ -1061,7 +1060,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
/*
* Build the set of subsystem state objects that we want to see in the
- * new css_set. while subsystems can change globally, the entries here
+ * new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
@@ -1151,7 +1150,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
/*
* Always add links to the tail of the lists so that the lists are
- * in choronological order.
+ * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
@@ -1211,7 +1210,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1273,11 +1273,25 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1301,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- kfree(root);
+ kfree_rcu(root, rcu);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1333,89 +1347,130 @@ static void cgroup_destroy_root(struct cgroup_root *root)
spin_unlock_irq(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
+
+ if (!have_favordynmods)
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
+ cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
+ struct cgroup *res_cgroup = NULL;
- rcu_read_lock();
-
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_task_namespaces() and
+ * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
+ * cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1559,7 +1614,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
else
cgrp = kn->parent->priv;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
@@ -1604,7 +1659,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1635,7 +1690,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
- * @css: taget css
+ * @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
@@ -1648,12 +1703,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1672,18 +1731,28 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
struct cftype *cfts, *failed_cfts;
int ret;
- if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ if (css->flags & CSS_VISIBLE)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ if (ret < 0)
+ return ret;
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1710,7 +1779,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1727,20 +1797,43 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -1749,11 +1842,31 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
+ /*
+ * all css_sets of scgrp together in same order to dcgrp,
+ * patch in-flight iterators to preserve correct iteration.
+ * since the iterator is always advanced right away and
+ * finished when it->cset_pos meets it->cset_head, so only
+ * update it->cset_head is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
spin_unlock_irq(&css_set_lock);
+ if (ss->css_rstat_flush) {
+ list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
+ list_add_rcu(&css->rstat_css_node,
+ &dcgrp->rstat_css_list);
+ }
+
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -1793,7 +1906,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
- if (len >= PATH_MAX)
+ if (len == -E2BIG)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
@@ -1805,15 +1918,19 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
{}
};
@@ -1831,12 +1948,18 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
}
return -EINVAL;
}
@@ -1849,6 +1972,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@@ -1858,6 +1984,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
}
@@ -1865,10 +1996,14 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
return 0;
}
@@ -1910,12 +2045,13 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -1966,18 +2102,22 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
- ret = rebind_subsystems(root, ss_mask);
+ ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto exit_stats;
+
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
@@ -1988,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
- list_add(&root->root_list, &cgroup_roots);
+ list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
@@ -2006,10 +2146,11 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
+exit_stats:
+ cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -2043,13 +2184,13 @@ int cgroup_do_get_tree(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
@@ -2086,7 +2227,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2132,6 +2273,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+ if (have_favordynmods)
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
return 0;
}
@@ -2143,13 +2288,14 @@ static void cgroup_kill_sb(struct super_block *sb)
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2227,56 +2373,58 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
- * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
- * @task: target task
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Determine @task's cgroup on the first (the one with the lowest non-zero
- * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
- * function grabs cgroup_mutex and shouldn't be used inside locks used by
- * cgroup controller callbacks.
- *
- * Return value is the same as kernfs_path().
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
*/
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+void cgroup_attach_lock(bool lock_threadgroup)
{
- struct cgroup_root *root;
- struct cgroup *cgrp;
- int hierarchy_id = 1;
- int ret;
-
- mutex_lock(&cgroup_mutex);
- spin_lock_irq(&css_set_lock);
-
- root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
-
- if (root) {
- cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
- } else {
- /* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
- }
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
- spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
- return ret;
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
}
-EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
@@ -2347,7 +2495,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
- while (&cset->mg_node != tset->csets) {
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
@@ -2360,7 +2508,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/*
* This function may be called both before and
- * after cgroup_taskset_migrate(). The two cases
+ * after cgroup_migrate_execute(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
@@ -2380,7 +2528,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -2505,10 +2653,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2532,21 +2676,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2586,11 +2736,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -2599,7 +2749,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2624,7 +2774,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2643,7 +2793,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2651,8 +2801,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2689,19 +2839,17 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct task_struct *task;
/*
- * Prevent freeing of tasks while we take a snapshot. Tasks that are
- * already PF_EXITING could be freed from underneath us unless we
- * take an rcu_read_lock.
+ * The following thread iteration should be inside an RCU critical
+ * section to prevent tasks from being freed while taking the snapshot.
+ * spin_lock_irq() implies RCU critical section here.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -2748,8 +2896,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2766,12 +2913,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2802,17 +2945,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2820,8 +2960,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2876,29 +3016,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2910,7 +3068,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -2931,7 +3089,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
int ssid;
restart:
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
@@ -2945,7 +3103,7 @@ restart:
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
schedule();
finish_wait(&dsct->offline_waitq, &wait);
@@ -3147,11 +3305,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3509,9 +3663,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
-static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (css && !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
@@ -3528,14 +3705,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
return ret;
}
+static int cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_local_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_local_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+#endif
+
static int cpu_stat_show(struct seq_file *seq, void *v)
{
- struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
- ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+ ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
return ret;
}
@@ -3544,30 +3751,32 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
+ struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
@@ -3576,14 +3785,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3593,33 +3808,117 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
+}
+
+bool cgroup_psi_enabled(void)
+{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
}
+
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3656,32 +3955,128 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ set_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+
+ spin_lock_irq(&css_set_lock);
+ clear_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
+ if (!nbytes)
+ return 0;
+
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
@@ -3690,7 +4085,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -3725,7 +4120,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
@@ -3787,20 +4182,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -3813,25 +4194,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
- int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_fsuid(), current_fsgid(),
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
@@ -3931,19 +4305,26 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
+
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@@ -3956,30 +4337,29 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
- return 0;
}
/**
@@ -3995,12 +4375,16 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
- int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
- mutex_lock(&cgroup_mutex);
- ret = cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
+ cgroup_lock();
+ cgroup_rm_cftypes_locked(cfts);
+ cgroup_unlock();
+ return 0;
}
/**
@@ -4031,14 +4415,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -4102,6 +4486,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4135,7 +4539,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
- * have dropped rcu_read_lock() inbetween iterations.
+ * have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
@@ -4383,7 +4787,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
}
/**
- * css_task_iter_advance_css_set - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
@@ -4515,14 +4919,16 @@ repeat:
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
+ unsigned long irqflags;
+
memset(it, 0, sizeof(*it));
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
it->ss = css->ss;
it->flags = flags;
- if (it->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
@@ -4531,7 +4937,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
css_task_iter_advance(it);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
/**
@@ -4544,12 +4950,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4562,7 +4970,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
css_task_iter_advance(it);
}
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
return it->cur_task;
}
@@ -4575,11 +4983,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
void css_task_iter_end(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_cset) {
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
if (it->cur_dcset)
@@ -4591,21 +5001,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4613,21 +5023,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4671,16 +5078,16 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
int ret;
@@ -4709,11 +5116,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb, bool threadgroup)
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
int ret = 0;
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
@@ -4727,19 +5135,21 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
return ret;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4749,19 +5159,33 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, true);
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@ -4772,41 +5196,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, false);
- if (ret)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
@@ -4873,12 +5263,26 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_freeze_write,
},
{
+ .name = "cgroup.kill",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat.local",
+ .seq_show = cpu_local_stat_show,
+ },
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4886,6 +5290,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4893,11 +5298,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -4918,7 +5339,7 @@ static struct cftype cgroup_base_files[] = {
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
- * css_free_work_fn().
+ * css_free_rwork_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
@@ -4949,6 +5370,7 @@ static void css_free_rwork_fn(struct work_struct *work)
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
@@ -4960,8 +5382,7 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4981,7 +5402,7 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
@@ -5002,8 +5423,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5023,7 +5443,7 @@ static void css_release_work_fn(struct work_struct *work)
NULL);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
@@ -5060,7 +5480,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ if (ss->css_rstat_flush)
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
BUG_ON(cgroup_css(cgrp, ss));
@@ -5150,15 +5570,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
if (err)
goto err_list_del;
- if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- cgroup_parent(parent)) {
- pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
- if (!strcmp(ss->name, "memory"))
- pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
- ss->warned_broken_hierarchy = true;
- }
-
return css;
err_list_del:
@@ -5172,8 +5583,7 @@ err_free_css:
/*
* The returned cgroup is fully initialized including its control mask, but
- * it isn't associated with its kernfs_node and doesn't have the control
- * mask applied.
+ * it doesn't have the control mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
@@ -5185,8 +5595,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5194,14 +5603,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- if (cgroup_on_dfl(parent)) {
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
- }
+ ret = cgroup_rstat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_stat_exit;
@@ -5240,7 +5649,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5285,8 +5694,7 @@ out_psi_free:
out_kernfs_remove:
kernfs_remove(cgrp->kn);
out_stat_exit:
- if (cgroup_on_dfl(parent))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5347,10 +5755,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(cgrp->kn);
- if (ret)
- goto out_destroy;
-
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
@@ -5377,14 +5781,14 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
do {
offline_css(css);
@@ -5393,7 +5797,7 @@ static void css_killed_work_fn(struct work_struct *work)
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* css kill confirmation processing requires process context, bounce */
@@ -5503,7 +5907,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
- * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * cgroup_kn_lock_live(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
@@ -5521,11 +5925,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
- for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
@@ -5577,14 +5981,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
pr_debug("Initializing cgroup subsys %s\n", ss->name);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -5621,7 +6025,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
BUG_ON(online_css(css));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/**
@@ -5661,8 +6065,6 @@ int __init cgroup_init_early(void)
return 0;
}
-static u16 cgroup_disable_mask __initdata;
-
/**
* cgroup_init - cgroup initialization
*
@@ -5676,19 +6078,14 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/*
* Add init_css_set to the hash table so that dfl_root can link to
@@ -5699,7 +6096,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for_each_subsys(ss, ssid) {
if (ss->early_init) {
@@ -5721,16 +6118,12 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (cgroup_disable_mask & (1 << ssid)) {
- static_branch_disable(cgroup_subsys_enabled_key[ssid]);
- printk(KERN_INFO "Disabling %s control group subsystem\n",
- ss->name);
+ if (!cgroup_ssid_enabled(ssid))
continue;
- }
if (cgroup1_ssid_disabled(ssid))
- printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
- ss->name);
+ pr_info("Disabling %s control group subsystem in v1 mounts\n",
+ ss->legacy_name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@@ -5755,9 +6148,9 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css_populate_dir(init_css_set.subsys[ssid]);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -5804,6 +6197,48 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp, *root_cgrp;
+
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
+
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
+ cgrp = NULL;
+
+ rcu_read_unlock();
+ kernfs_put(kn);
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
+/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
@@ -5820,7 +6255,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- mutex_lock(&cgroup_mutex);
+ rcu_read_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -5828,7 +6263,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
+ continue;
+
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5841,9 +6281,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
-
- cgrp = task_cgroup_from_root(tsk, root);
-
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
@@ -5856,7 +6293,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
@@ -5875,7 +6312,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ rcu_read_unlock();
kfree(buf);
out:
return retval;
@@ -5894,16 +6331,37 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
@@ -5938,7 +6396,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_threadgroup_change_begin(current);
@@ -5980,8 +6438,23 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
- !(kargs->flags & CLONE_THREAD));
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
if (ret)
goto err;
@@ -5998,7 +6471,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
err:
cgroup_threadgroup_change_end(current);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (f)
fput(f);
if (dst_cgrp)
@@ -6019,19 +6492,18 @@ err:
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
- cgroup_threadgroup_change_end(current);
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
- if (kargs->flags & CLONE_INTO_CGROUP) {
- struct cgroup *cgrp = kargs->cgrp;
- struct css_set *cset = kargs->cset;
-
- mutex_unlock(&cgroup_mutex);
+ cgroup_threadgroup_change_end(current);
- if (cset) {
- put_css_set(cset);
- kargs->cset = NULL;
- }
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ cgroup_unlock();
if (cgrp) {
cgroup_put(cgrp);
kargs->cgrp = NULL;
@@ -6042,6 +6514,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6085,7 +6558,7 @@ out_revert:
* @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded and cleans up references we took to
+ * cgroup_can_fork() succeeded and cleans up references we took to
* prepare a new css_set for the child process in cgroup_can_fork().
*/
void cgroup_cancel_fork(struct task_struct *child,
@@ -6104,6 +6577,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
@@ -6112,6 +6586,8 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
@@ -6123,6 +6599,11 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
+ if (kargs->cgrp)
+ cgrp_flags = kargs->cgrp->flags;
+ else
+ cgrp_flags = cset->dfl_cgrp->flags;
+
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
@@ -6131,23 +6612,32 @@ void cgroup_post_fork(struct task_struct *child,
cset = NULL;
}
- /*
- * If the cgroup has to be frozen, the new task has too. Let's set
- * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
- * frozen state.
- */
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
/*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later from
- * do_freezer_trap(). So we avoid cgroup's transient switch
- * from the frozen state and back.
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
*/
+ kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@@ -6170,6 +6660,10 @@ void cgroup_post_fork(struct task_struct *child,
put_css_set(rcset);
}
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
cgroup_css_set_put_fork(kargs);
}
@@ -6194,8 +6688,12 @@ void cgroup_exit(struct task_struct *tsk)
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
+
WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@@ -6241,7 +6739,19 @@ static int __init cgroup_disable(char *str)
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
- cgroup_disable_mask |= 1 << i;
+
+ static_branch_disable(cgroup_subsys_enabled_key[i]);
+ pr_info("Disabling %s control group subsystem\n",
+ ss->name);
+ }
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
}
}
return 1;
@@ -6258,6 +6768,12 @@ static int __init enable_cgroup_debug(char *str)
}
__setup("cgroup_debug", enable_cgroup_debug);
+static int __init cgroup_favordynmods_setup(char *str)
+{
+ return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6318,54 +6834,77 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- mutex_lock(&cgroup_mutex);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ if (!kn)
+ goto out;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
- struct file *f;
-
- f = fget_raw(fd);
- if (!f)
+ struct fd f = fdget_raw(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f);
- fput(f);
+ cgrp = cgroup_v1v2_get_from_file(f.file);
+ fdput(f);
+ return cgrp;
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6418,118 +6957,57 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
-
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
@@ -6560,8 +7038,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6577,8 +7058,10 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
+ "favordynmods\n"
"memory_localevents\n"
- "memory_recursiveprot\n");
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b8c3c9..927bef3a598a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,50 +25,37 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/fs.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
-#include <linux/mount.h>
-#include <linux/fs_context.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
-#include <linux/seq_file.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/time64.h>
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
-#include <linux/uaccess.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgment
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -78,6 +65,32 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+ PERR_HKEEPING,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+ [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -98,7 +111,7 @@ struct cpuset {
* and if it ends up empty, it will inherit the parent's mask.
*
*
- * On legacy hierachy:
+ * On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
@@ -112,14 +125,23 @@ struct cpuset {
nodemask_t effective_mems;
/*
- * CPUs allocated to child sub-partitions (default hierarchy only)
- * - CPUs granted by the parent = effective_cpus U subparts_cpus
- * - effective_cpus and subparts_cpus are mutually exclusive.
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+ *
+ * This exclusive CPUs must be a subset of cpus_allowed. A parent
+ * cgroup can only grant exclusive CPUs to one of its children.
*
- * effective_cpus contains only onlined CPUs, but subparts_cpus
- * may have offlined ones.
+ * When the cgroup becomes a valid partition root, effective_xcpus
+ * defaults to cpus_allowed if not set. The effective_cpus of a valid
+ * partition root comes solely from its effective_xcpus and some of the
+ * effective_xcpus may be distributed to sub-partitions below & hence
+ * excluded from its effective_cpus.
*/
- cpumask_var_t subparts_cpus;
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
+ */
+ cpumask_var_t exclusive_cpus;
/*
* This is old Memory Nodes tasks took on.
@@ -147,8 +169,8 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of CPUs in subparts_cpus */
- int nr_subparts_cpus;
+ /* number of valid sub-partitions */
+ int nr_subparts;
/* partition root state */
int partition_root_state;
@@ -160,25 +182,57 @@ struct cpuset {
*/
int use_parent_ecpus;
int child_ecpus_count;
+
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
+
+ /* Remote partition silbling list anchored at remote_children */
+ struct list_head remote_sibling;
};
/*
+ * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ */
+static cpumask_var_t subpartitions_cpus;
+
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
+
+/* List of remote partition root children */
+static struct list_head remote_children;
+
+/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -205,6 +259,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
+void inc_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks++;
+}
+
+void dec_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks--;
+}
+
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -258,15 +326,44 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (cs->partition_root_state > 0)
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
+/*
+ * Send notification event of whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
+{
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
+ .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
/**
@@ -301,17 +398,20 @@ static struct cpuset top_cpuset = {
* There are two global locks guarding cpuset structures - cpuset_mutex and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
+ * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * structures. Note that cpuset_mutex needs to be a mutex as it is used in
+ * paths that rely on priority inheritance (e.g. scheduler - on RT) for
+ * correctness.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_lock and be able to
- * modify cpusets. It can perform various checks on the cpuset structure
- * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_lock to query cpusets.
- * Once it is ready to make the changes, it takes callback_lock, blocking
- * everyone else.
+ * cpuset_mutex, it blocks others, ensuring that it is the only task able to
+ * also acquire callback_lock and be able to modify cpusets. It can perform
+ * various checks on the cpuset structure first, knowing nothing will change.
+ * It can also allocate memory while just holding cpuset_mutex. While it is
+ * performing these checks, various callback routines can briefly acquire
+ * callback_lock to query cpusets. Once it is ready to make the changes, it
+ * takes callback_lock, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
@@ -333,16 +433,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+static DEFINE_MUTEX(cpuset_mutex);
-void cpuset_read_lock(void)
+void cpuset_lock(void)
{
- percpu_down_read(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}
-void cpuset_read_unlock(void)
+void cpuset_unlock(void)
{
- percpu_up_read(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
static DEFINE_SPINLOCK(callback_lock);
@@ -357,6 +457,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -371,33 +482,81 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
* The top cpuset doesn't have any online cpu as a
* consequence of a race between cpuset_hotplug_work
* and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to to be
+ * cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -421,11 +580,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
@@ -463,16 +626,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
*/
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3;
+ cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
if (cs) {
pmask1 = &cs->cpus_allowed;
pmask2 = &cs->effective_cpus;
- pmask3 = &cs->subparts_cpus;
+ pmask3 = &cs->effective_xcpus;
+ pmask4 = &cs->exclusive_cpus;
} else {
pmask1 = &tmp->new_cpus;
pmask2 = &tmp->addmask;
pmask3 = &tmp->delmask;
+ pmask4 = NULL;
}
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -484,8 +649,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
goto free_two;
+ if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
+ goto free_three;
+
+
return 0;
+free_three:
+ free_cpumask_var(*pmask3);
free_two:
free_cpumask_var(*pmask2);
free_one:
@@ -503,7 +674,8 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (cs) {
free_cpumask_var(cs->cpus_allowed);
free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->subparts_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
}
if (tmp) {
free_cpumask_var(tmp->new_cpus);
@@ -531,6 +703,8 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
return trial;
}
@@ -544,6 +718,57 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
+static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
+{
+ return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
+ cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
+ : cs->effective_xcpus;
+}
+
+/*
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
+ *
+ * Return true if exclusive, false if not
+ */
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
+{
+ struct cpumask *xcpus1 = fetch_xcpus(cs1);
+ struct cpumask *xcpus2 = fetch_xcpus(cs2);
+
+ if (cpumask_intersects(xcpus1, xcpus2))
+ return false;
+ return true;
+}
+
+/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -568,44 +793,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
+ int ret = 0;
rcu_read_lock();
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
/* Remaining checks don't apply to root cpuset */
- ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
- /* On legacy hiearchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
- goto out;
-
- /*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
/*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
@@ -630,6 +832,23 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur) {
+ if (!cpusets_are_exclusive(trial, c))
+ goto out;
+ }
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -753,7 +972,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
+ if (root_load_balance && !top_cpuset.nr_subparts) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -765,7 +984,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -795,7 +1014,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -807,7 +1026,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -884,7 +1103,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -911,11 +1130,14 @@ done:
return ndoms;
}
-static void update_tasks_root_domain(struct cpuset *cs)
+static void dl_update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ if (cs->nr_deadline_tasks == 0)
+ return;
+
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
@@ -924,12 +1146,12 @@ static void update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void rebuild_root_domains(void)
+static void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -952,7 +1174,7 @@ static void rebuild_root_domains(void)
rcu_read_unlock();
- update_tasks_root_domain(cs);
+ dl_update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
@@ -966,7 +1188,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
mutex_lock(&sched_domains_mutex);
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- rebuild_root_domains();
+ dl_rebuild_rd_accounting();
mutex_unlock(&sched_domains_mutex);
}
@@ -979,29 +1201,52 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
+ * Call with cpuset_mutex held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
+ struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
+ struct cpuset *cs;
int ndoms;
lockdep_assert_cpus_held();
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
- * We have raced with CPU hotplug. Don't do anything to avoid
+ * If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, hotplug work item will rebuild sched domains.
+ * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ *
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+ * should be the same as the active CPUs, so checking only top_cpuset
+ * is enough to detect racing CPU offlines.
*/
- if (!top_cpuset.nr_subparts_cpus &&
+ if (cpumask_empty(subpartitions_cpus) &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
- if (top_cpuset.nr_subparts_cpus &&
- !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /*
+ * With subpartition CPUs, however, the effective CPUs of a partition
+ * root should be only a subset of the active CPUs. Since a CPU in any
+ * partition root could be offlined, all must be checked.
+ */
+ if (top_cpuset.nr_subparts) {
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (!is_partition_valid(cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ if (!cpumask_subset(cs->effective_cpus,
+ cpu_active_mask)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ }
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
@@ -1017,29 +1262,46 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
}
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable.
+ * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
+ * is used instead of effective_cpus to make sure all offline CPUs are also
+ * included as hotplug code won't update cpumasks for tasks in top_cpuset.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+ while ((task = css_task_iter_next(&it))) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(task);
+
+ if (top_cs) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if (kthread_is_per_cpu(task))
+ continue;
+ cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
+ } else {
+ cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
+ }
+ set_cpus_allowed_ptr(task, new_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1049,267 +1311,930 @@ static void update_tasks_cpumask(struct cpuset *cs)
* @cs: the cpuset the need to recompute the new effective_cpus mask
* @parent: the parent cpuset
*
- * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask. Since offlined
- * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
- * to mask those out.
+ * The result is valid only if the given cpuset isn't a partition root.
*/
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus) {
- cpumask_or(new_cpus, parent->effective_cpus,
- parent->subparts_cpus);
- cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
- cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+}
+
+/*
+ * Commands for update_parent_effective_cpumask
+ */
+enum partition_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's effective_cpus */
+ partcmd_invalidate, /* Make partition invalid */
+};
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > 0);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+ bool new_lb;
+
+ /*
+ * If cs is not a valid partition root, the load balance state
+ * will follow its parent.
+ */
+ if (new_prs > 0) {
+ new_lb = (new_prs != PRS_ISOLATED);
} else {
- cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ new_lb = is_sched_load_balance(parent_cs(cs));
+ }
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
+
+ if (rebuild_domains)
+ rebuild_sched_domains_locked();
}
/*
- * Commands for update_parent_subparts_cpumask
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
*/
-enum subparts_cmd {
- partcmd_enable, /* Enable partition root */
- partcmd_disable, /* Disable partition root */
- partcmd_update, /* Update parent's subparts_cpus */
-};
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ /*
+ * A populated partition (cs or parent) can't have empty effective_cpus
+ */
+ return (cpumask_subset(parent->effective_cpus, xcpus) &&
+ partition_is_populated(parent, cs)) ||
+ (!cpumask_intersects(xcpus, cpu_active_mask) &&
+ partition_is_populated(cs, NULL));
+}
+
+static void reset_partition_data(struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
+ lockdep_assert_held(&callback_lock);
+
+ cs->nr_subparts = 0;
+ if (cpumask_empty(cs->exclusive_cpus)) {
+ cpumask_clear(cs->effective_xcpus);
+ if (is_cpu_exclusive(cs))
+ clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ }
+ if (!cpumask_and(cs->effective_cpus,
+ parent->effective_cpus, cs->cpus_allowed)) {
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ }
+}
+
+/*
+ * partition_xcpus_newstate - Exclusive CPUs state change
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (new_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (old_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+{
+ int ret;
+
+ lockdep_assert_cpus_held();
+
+ if (!isolcpus_updated)
+ return;
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+}
/**
- * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
- * @cpuset: The cpuset that requests change in partition root state
- * @cmd: Partition root state change command
- * @newmask: Optional new cpumask for partcmd_update
- * @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
- *
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The cpus_allowed mask of the given cpuset will
- * be put into parent's subparts_cpus and taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * cpus_allowed can be granted or an error code will be returned.
- *
- * For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
- * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
- *
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
- *
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
- */
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
- struct cpumask *newmask,
- struct tmpmasks *tmp)
-{
- struct cpuset *parent = parent_cs(cpuset);
- int adding; /* Moving cpus from effective_cpus to subparts_cpus */
- int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
- bool part_error = false; /* Partition error? */
-
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
+/*
+ * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: true if xcpus is not empty, false otherwise.
+ *
+ * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
+ * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ */
+static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!xcpus)
+ xcpus = cs->effective_xcpus;
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
+ else
+ cpumask_copy(xcpus, cs->cpus_allowed);
+
+ return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+}
+
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+ return !list_empty(&cs->remote_sibling);
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+ return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @new_prs: new partition_root_state
+ * @tmp: temparary masks
+ * Return: 1 if successful, 0 if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
+{
+ bool isolcpus_updated;
/*
- * The parent must be a partition root.
- * The new cpumask, if present, or the current cpus_allowed must
- * not be empty.
+ * The user must have sysadmin privilege.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
/*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
+ * The requested exclusive_cpus must not be allocated to other
+ * partitions and it can't use up all the root's effective_cpus.
+ *
+ * Note that if there is any local partition root above it or
+ * remote partition root underneath it, its exclusive_cpus must
+ * have overlapped with subpartitions_cpus.
*/
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ if (cpumask_empty(tmp->new_cpus) ||
+ cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+ return 0;
+
+ spin_lock_irq(&callback_lock);
+ isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ list_add(&cs->remote_sibling, &remote_children);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
+ * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
*/
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return 1;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temparary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ bool isolcpus_updated;
+
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ WARN_ON_ONCE(!is_remote_partition(cs));
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+
+ spin_lock_irq(&callback_lock);
+ list_del_init(&cs->remote_sibling);
+ isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+ NULL, tmp->new_cpus);
+ cs->partition_root_state = -cs->partition_root_state;
+ if (!cs->prs_err)
+ cs->prs_err = PERR_INVCPUS;
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @tmp: temparary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ bool adding, deleting;
+ int prs = cs->partition_root_state;
+ int isolcpus_updated = 0;
+
+ if (WARN_ON_ONCE(!is_remote_partition(cs)))
+ return;
+
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ if (cpumask_empty(newmask))
+ goto invalidate;
+
+ adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+
+ /*
+ * Additions of remote CPUs is only allowed if those CPUs are
+ * not allocated to other partitions and there are effective_cpus
+ * left in the top cpuset.
+ */
+ if (adding && (!capable(CAP_SYS_ADMIN) ||
+ cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
+ goto invalidate;
+
+ spin_lock_irq(&callback_lock);
+ if (adding)
+ isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ /*
+ * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return;
+
+invalidate:
+ remote_partition_disable(cs, tmp);
+}
+
+/*
+ * remote_partition_check - check if a child remote partition needs update
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @delmask: temporary mask for deletion (not in tmp)
+ * @tmp: temparary masks
+ *
+ * This should be called before the given cs has updated its cpus_allowed
+ * and/or effective_xcpus.
+ */
+static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
+ struct cpumask *delmask, struct tmpmasks *tmp)
+{
+ struct cpuset *child, *next;
+ int disable_cnt = 0;
+
+ /*
+ * Compute the effective exclusive CPUs that will be deleted.
+ */
+ if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
+ !cpumask_intersects(delmask, subpartitions_cpus))
+ return; /* No deletion of exclusive CPUs in partitions */
+
+ /*
+ * Searching the remote children list to look for those that will
+ * be impacted by the deletion of exclusive CPUs.
+ *
+ * Since a cpuset must be removed from the remote children list
+ * before it can go offline and holding cpuset_mutex will prevent
+ * any change in cpuset status. RCU read lock isn't needed.
+ */
+ lockdep_assert_held(&cpuset_mutex);
+ list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
+ if (cpumask_intersects(child->effective_cpus, delmask)) {
+ remote_partition_disable(child, tmp);
+ disable_cnt++;
+ }
+ if (disable_cnt)
+ rebuild_sched_domains_locked();
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
+ * an isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+ const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
+
+ if (!all_in_hk && (prstate != PRS_ISOLATED))
+ return true;
+
+ return false;
+}
+
+/**
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
+ * @cs: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0 or a partition root state error code
+ *
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transformed from a partition
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
+ *
+ * For partcmd_invalidate, the current partition will be made invalid.
+ *
+ * The partcmd_enable* and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
+ */
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cs);
+ int adding; /* Adding cpus to parent's effective_cpus */
+ int deleting; /* Deleting cpus from parent's effective_cpus */
+ int old_prs, new_prs;
+ int part_error = PERR_NONE; /* Partition error? */
+ int subparts_delta = 0;
+ struct cpumask *xcpus; /* cs effective_xcpus */
+ int isolcpus_updated = 0;
+ bool nocpu;
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
- if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
- adding = true;
+ old_prs = new_prs = cs->partition_root_state;
+ xcpus = !cpumask_empty(cs->exclusive_cpus)
+ ? cs->effective_xcpus : cs->cpus_allowed;
+
+ if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid.
+ */
+ if (is_partition_valid(parent))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ subparts_delta--;
+ }
+ goto write_error;
+ }
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if (!newmask && cpumask_empty(cs->cpus_allowed))
+ return PERR_CPUSEMPTY;
+
+ nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
+ /*
+ * Enabling partition root is not allowed if its
+ * effective_xcpus is empty or doesn't overlap with
+ * parent's effective_xcpus.
+ */
+ if (cpumask_empty(xcpus) ||
+ !cpumask_intersects(xcpus, parent->effective_xcpus))
+ return PERR_INVCPUS;
+
+ if (prstate_housekeeping_conflict(new_prs, xcpus))
+ return PERR_HKEEPING;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (nocpu)
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->delmask, xcpus);
+ deleting = true;
+ subparts_delta++;
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
+ /*
+ * May need to add cpus to parent's effective_cpus for
+ * valid partition root.
+ */
+ adding = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
+ if (adding)
+ subparts_delta--;
+ new_prs = PRS_MEMBER;
} else if (newmask) {
/*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ goto write_error;
+ }
+
+ /*
* partcmd_update with newmask:
*
- * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
- * & ~parent->subparts_cpus
+ * Compute add/delete mask to/from effective_cpus
+ *
+ * For valid partition:
+ * addmask = exclusive_cpus & ~newmask
+ * & parent->effective_xcpus
+ * delmask = newmask & ~exclusive_cpus
+ * & parent->effective_xcpus
+ *
+ * For invalid partition:
+ * delmask = newmask & parent->effective_xcpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
- deleting = cpumask_and(tmp->delmask, tmp->delmask,
- parent->subparts_cpus);
-
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
- adding = cpumask_andnot(tmp->addmask, tmp->addmask,
- parent->subparts_cpus);
+ if (is_prs_invalid(old_prs)) {
+ adding = false;
+ deleting = cpumask_and(tmp->delmask,
+ newmask, parent->effective_xcpus);
+ } else {
+ cpumask_andnot(tmp->addmask, xcpus, newmask);
+ adding = cpumask_and(tmp->addmask, tmp->addmask,
+ parent->effective_xcpus);
+
+ cpumask_andnot(tmp->delmask, newmask, xcpus);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->effective_xcpus);
+ }
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
- if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ if (nocpu && (!adding ||
+ !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
+ part_error = PERR_NOCPUS;
+ deleting = false;
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
}
} else {
/*
- * partcmd_update w/o newmask:
+ * partcmd_update w/o newmask
*
- * addmask = cpus_allowed & parent->effectiveb_cpus
+ * delmask = effective_xcpus & parent->effective_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This can be called from:
+ * 1) update_cpumasks_hier()
+ * 2) cpuset_hotplug_update_tasks()
+ *
+ * Check to see if it can be transitioned from valid to
+ * invalid partition or vice versa.
+ *
+ * A partition error happens when parent has tasks and all
+ * its effective CPUs will have to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ WARN_ON_ONCE(!is_partition_valid(parent));
+ if (nocpu) {
+ part_error = PERR_NOCPUS;
+ if (is_partition_valid(cs))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ } else if (is_partition_invalid(cs) &&
+ cpumask_subset(xcpus, parent->effective_xcpus)) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool exclusive = true;
+
+ /*
+ * Convert invalid partition to valid has to
+ * pass the cpu exclusivity test.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, parent) {
+ if (child == cs)
+ continue;
+ if (!cpusets_are_exclusive(cs, child)) {
+ exclusive = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (exclusive)
+ deleting = cpumask_and(tmp->delmask,
+ xcpus, parent->effective_cpus);
+ else
+ part_error = PERR_NOTEXCL;
+ }
}
- if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
+write_error:
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
+ if (cmd == partcmd_update) {
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
- if (part_error)
- cpuset->partition_root_state = PRS_ERROR;
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ if (part_error) {
+ new_prs = -old_prs;
+ subparts_delta--;
+ }
break;
- case PRS_ERROR:
- if (!part_error)
- cpuset->partition_root_state = PRS_ENABLED;
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
+ if (!part_error) {
+ new_prs = -old_prs;
+ subparts_delta++;
+ }
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
}
- if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
- return 0; /* Nothing need to be done */
+ if (!adding && !deleting && (new_prs == old_prs))
+ return 0;
- if (cpuset->partition_root_state == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
- }
+ /*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+ * validate_change() has already been successfully called and
+ * CPU lists in cs haven't been updated yet. So defer it to later.
+ */
+ if ((old_prs != new_prs) && (cmd != partcmd_update)) {
+ int err = update_partition_exclusive(cs, new_prs);
- if (!adding && !deleting)
- return 0;
+ if (err)
+ return err;
+ }
/*
- * Change the parent's subparts_cpus.
+ * Change the parent's effective_cpus & effective_xcpus (top cpuset
+ * only).
+ *
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (adding) {
- cpumask_or(parent->subparts_cpus,
- parent->subparts_cpus, tmp->addmask);
- cpumask_andnot(parent->effective_cpus,
- parent->effective_cpus, tmp->addmask);
- }
- if (deleting) {
- cpumask_andnot(parent->subparts_cpus,
- parent->subparts_cpus, tmp->delmask);
- /*
- * Some of the CPUs in subparts_cpus might have been offlined.
- */
- cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
- cpumask_or(parent->effective_cpus,
- parent->effective_cpus, tmp->delmask);
+ if (old_prs != new_prs) {
+ cs->partition_root_state = new_prs;
+ if (new_prs <= 0)
+ cs->nr_subparts = 0;
+ }
+ /*
+ * Adding to parent's effective_cpus means deletion CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ isolcpus_updated += partition_xcpus_del(old_prs, parent,
+ tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_add(new_prs, parent,
+ tmp->delmask);
+
+ if (is_partition_valid(parent)) {
+ parent->nr_subparts += subparts_delta;
+ WARN_ON_ONCE(parent->nr_subparts < 0);
}
-
- parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ if ((old_prs != new_prs) && (cmd == partcmd_update))
+ update_partition_exclusive(cs, new_prs);
+
+ if (adding || deleting) {
+ update_tasks_cpumask(parent, tmp->addmask);
+ update_sibling_cpumasks(parent, cs, tmp);
+ }
- return cmd == partcmd_update;
+ /*
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
+ * Update the load balance flag and scheduling domain if
+ * cpus_read_trylock() is successful.
+ */
+ if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ update_partition_sd_lb(cs, old_prs);
+ cpus_read_unlock();
+ }
+
+ notify_partition_change(cs, old_prs);
+ return 0;
}
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and excluding their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * Note that rcu_read_lock() is assumed to be held.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+ struct cpumask *new_ecpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool populated = partition_is_populated(cs, NULL);
+
+ /*
+ * Check child partition roots to see if they should be
+ * invalidated when
+ * 1) child effective_xcpus not a subset of new
+ * excluisve_cpus
+ * 2) All the effective_cpus will be used up and cp
+ * has tasks
+ */
+ compute_effective_exclusive_cpumask(cs, new_ecpus);
+ cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (!is_partition_valid(child))
+ continue;
+
+ child->prs_err = 0;
+ if (!cpumask_subset(child->effective_xcpus,
+ cs->effective_xcpus))
+ child->prs_err = PERR_INVCPUS;
+ else if (populated &&
+ cpumask_subset(new_ecpus, child->effective_xcpus))
+ child->prs_err = PERR_NOCPUS;
+
+ if (child->prs_err) {
+ int old_prs = child->partition_root_state;
+
+ /*
+ * Invalidate child partition
+ */
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(child);
+ cs->nr_subparts--;
+ child->nr_subparts = 0;
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(child, old_prs);
+ continue;
+ }
+ cpumask_andnot(new_ecpus, new_ecpus,
+ child->effective_xcpus);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * update_cpumasks_hier() flags
+ */
+#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
+#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
+
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
- * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
+ * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ int flags)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
+ int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool remote = is_remote_partition(cp);
+ bool update_parent = false;
+
+ /*
+ * Skip descendent remote partition that acquires CPUs
+ * directly from top cpuset unless it is cs.
+ */
+ if (remote && (cp != cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ /*
+ * Update effective_xcpus if exclusive_cpus set.
+ * The case when exclusive_cpus isn't set is handled later.
+ */
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
+ spin_lock_irq(&callback_lock);
+ compute_effective_exclusive_cpumask(cp, NULL);
+ spin_unlock_irq(&callback_lock);
+ }
- compute_effective_cpumask(tmp->new_cpus, cp, parent);
+ old_prs = new_prs = cp->partition_root_state;
+ if (remote || (is_partition_valid(parent) &&
+ is_partition_valid(cp)))
+ compute_partition_effective_cpumask(cp, tmp->new_cpus);
+ else
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+ /*
+ * A partition with no effective_cpus is allowed as long as
+ * there is no task associated with it. Call
+ * update_parent_effective_cpumask() to check it.
+ */
+ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+ update_parent = true;
+ goto update_parent_effective;
+ }
/*
* If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1321,103 +2246,100 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
parent->child_ecpus_count--;
}
+ if (remote)
+ goto get_css;
+
/*
- * Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+ * 3) HIER_CHECKALL flag not set, and
+ * 4) for v2 load balance state same as its parent.
*/
- if (!cp->partition_root_state &&
- cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_effective:
/*
- * update_parent_subparts_cpumask() should have been called
+ * update_parent_effective_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
* update_tasks_cpumask() again for tasks in the parent
- * cpuset if the parent's subparts_cpus changes.
+ * cpuset if the parent's effective_cpus changes.
*/
- if ((cp != cs) && cp->partition_root_state) {
+ if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear the state
- * state and the CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- cp->partition_root_state = 0;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
- break;
-
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- cp->partition_root_state = PRS_ERROR;
- if (cp->nr_subparts_cpus) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
-
+get_css:
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
-
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus &&
- (cp->partition_root_state != PRS_ENABLED)) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ if (update_parent) {
+ update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
/*
- * Make sure that effective_cpus & subparts_cpus
- * are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
*/
- cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
- cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
+ new_prs = cp->partition_root_state;
}
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cp->partition_root_state = new_prs;
+ /*
+ * Make sure effective_xcpus is properly set for a valid
+ * partition root.
+ */
+ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
+ cpumask_and(cp->effective_xcpus,
+ cp->cpus_allowed, parent->effective_xcpus);
+ else if (new_prs < 0)
+ reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
+ notify_partition_change(cp, old_prs);
+
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, cp->effective_cpus);
+
+ /*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -1428,7 +2350,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -1436,7 +2358,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
}
rcu_read_unlock();
- if (need_rebuild_sched_domains)
+ if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
rebuild_sched_domains_locked();
}
@@ -1452,19 +2374,41 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ lockdep_assert_held(&cpuset_mutex);
+
/*
* Check all its siblings and call update_cpumasks_hier()
- * if their use_parent_ecpus flag is set in order for them
- * to use the right effective_cpus value.
+ * if their effective_cpus will need to be changed.
+ *
+ * With the addition of effective_xcpus which is a subset of
+ * cpus_allowed. It is possible a change in parent's effective_cpus
+ * due to a change in a child partition's effective_xcpus will impact
+ * its siblings even if they do not inherit parent's effective_cpus
+ * directly.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+ * flag is used to suppress rebuild of sched domains as the callers
+ * will take care of that.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs)
continue;
- if (!sibling->use_parent_ecpus)
+ if (!sibling->use_parent_ecpus &&
+ !is_partition_valid(sibling)) {
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+ continue;
+ }
+ if (!css_tryget_online(&sibling->css))
continue;
- update_cpumasks_hier(sibling, tmp);
+ rcu_read_unlock();
+ update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1480,6 +2424,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
+ bool invalidate = false;
+ int hier_flags = 0;
+ int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1493,6 +2441,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
+ cpumask_clear(trialcs->effective_xcpus);
} else {
retval = cpulist_parse(buf, trialcs->cpus_allowed);
if (retval < 0)
@@ -1501,60 +2450,223 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed,
top_cpuset.cpus_allowed))
return -EINVAL;
+
+ /*
+ * When exclusive_cpus isn't explicitly set, it is constrainted
+ * by cpus_allowed and parent's effective_xcpus. Otherwise,
+ * trialcs->effective_xcpus is used as a temporary cpumask
+ * for checking validity of the partition root.
+ */
+ if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
+ compute_effective_exclusive_cpumask(trialcs, NULL);
}
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ if (old_prs) {
+ if (is_partition_valid(cs) &&
+ cpumask_empty(trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_INVCPUS;
+ } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_HKEEPING;
+ } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_NOCPUS;
+ }
+ }
-#ifdef CONFIG_CPUMASK_OFFSTACK
/*
- * Use the cpumasks in trialcs for tmpmasks when they are pointers
- * to allocated cpumasks.
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
*/
- tmp.addmask = trialcs->subparts_cpus;
- tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = trialcs->cpus_allowed;
-#endif
+ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+ hier_flags = HIER_CHECKALL;
- if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *cp;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ invalidate = true;
+ rcu_read_lock();
+ cpuset_for_each_child(cp, css, parent) {
+ struct cpumask *xcpus = fetch_xcpus(trialcs);
+
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(xcpus, cp->effective_xcpus)) {
+ rcu_read_unlock();
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+
+ if (retval < 0)
+ goto out_free;
+
+ if (is_partition_valid(cs) ||
+ (is_partition_invalid(cs) && !invalidate)) {
+ struct cpumask *xcpus = trialcs->effective_xcpus;
+
+ if (cpumask_empty(xcpus) && is_partition_invalid(cs))
+ xcpus = trialcs->cpus_allowed;
+
+ /*
+ * Call remote_cpus_update() to handle valid remote partition
+ */
+ if (is_remote_partition(cs))
+ remote_cpus_update(cs, xcpus, &tmp);
+ else if (invalidate)
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ else
+ update_parent_effective_cpumask(cs, partcmd_update,
+ xcpus, &tmp);
+ } else if (!cpumask_empty(cs->exclusive_cpus)) {
+ /*
+ * Use trialcs->effective_cpus as a temp cpumask
+ */
+ remote_partition_check(cs, trialcs->effective_xcpus,
+ trialcs->effective_cpus, &tmp);
}
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /* effective_cpus/effective_xcpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, hier_flags);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+out_free:
+ free_cpumasks(NULL, &tmp);
+ return retval;
+}
+
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
+ bool invalidate = false;
+ int hier_flags = 0;
+ int old_prs = cs->partition_root_state;
+
+ if (!*buf) {
+ cpumask_clear(trialcs->exclusive_cpus);
+ cpumask_clear(trialcs->effective_xcpus);
+ } else {
+ retval = cpulist_parse(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
+ if (!is_cpu_exclusive(cs))
+ set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
+ }
+
+ /* Nothing to do if the CPUs didn't change */
+ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+ return 0;
+
+ if (*buf)
+ compute_effective_exclusive_cpumask(trialcs, NULL);
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
*/
- if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
- }
- spin_unlock_irq(&callback_lock);
+ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+ hier_flags = HIER_CHECKALL;
- update_cpumasks_hier(cs, &tmp);
+ retval = validate_change(cs, trialcs);
+ if (retval)
+ return retval;
- if (cs->partition_root_state) {
- struct cpuset *parent = parent_cs(cs);
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+ if (old_prs) {
+ if (cpumask_empty(trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_INVCPUS;
+ } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_HKEEPING;
+ } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_NOCPUS;
+ }
+
+ if (is_remote_partition(cs)) {
+ if (invalidate)
+ remote_partition_disable(cs, &tmp);
+ else
+ remote_cpus_update(cs, trialcs->effective_xcpus,
+ &tmp);
+ } else if (invalidate) {
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ } else {
+ update_parent_effective_cpumask(cs, partcmd_update,
+ trialcs->effective_xcpus, &tmp);
+ }
+ } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
/*
- * For partition root, update the cpumasks of sibling
- * cpusets if they use parent's effective_cpus.
+ * Use trialcs->effective_cpus as a temp cpumask
*/
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ remote_partition_check(cs, trialcs->effective_xcpus,
+ trialcs->effective_cpus, &tmp);
}
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /*
+ * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+ * of the subtree when it is a valid partition root or effective_xcpus
+ * is updated.
+ */
+ if (is_partition_valid(cs) || hier_flags)
+ update_cpumasks_hier(cs, &tmp, hier_flags);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+
+ free_cpumasks(NULL, &tmp);
return 0;
}
@@ -1589,6 +2701,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct cpuset_migrate_mm_work *mwork;
+ if (nodes_equal(*from, *to)) {
+ mmput(mm);
+ return;
+ }
+
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
@@ -1703,7 +2820,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* When configured nodemask is changed, the effective nodemasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hiearchy, effective_mems will be the same with mems_allowed.
+ * On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
@@ -1805,6 +2922,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -1858,7 +2977,7 @@ static void update_tasks_flags(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
@@ -1912,92 +3031,117 @@ out:
return err;
}
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
* Call with cpuset_mutex held.
*/
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
- struct tmpmasks tmp;
+ struct tmpmasks tmpmask;
+ bool new_xcpus_state = false;
- if ((val != 0) && (val != 1))
- return -EINVAL;
- if (val == cs->partition_root_state)
+ if (old_prs == new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * Treat a previously invalid partition root as if it is a "member".
*/
- if (val && cs->partition_root_state)
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs))
+ old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- err = -EINVAL;
- if (!cs->partition_root_state) {
- /*
- * Turning on partition root requires setting the
- * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
- */
- if (cpumask_empty(cs->cpus_allowed))
- goto out;
+ /*
+ * Setup effective_xcpus if not properly set yet, it will be cleared
+ * later if partition becomes invalid.
+ */
+ if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cpumask_and(cs->effective_xcpus,
+ cs->cpus_allowed, parent->effective_xcpus);
+ spin_unlock_irq(&callback_lock);
+ }
- err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
- goto out;
+ err = update_partition_exclusive(cs, new_prs);
+ if (err)
+ goto out;
+
+ if (!old_prs) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
- err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmp);
- if (err) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- goto out;
- }
- cs->partition_root_state = PRS_ENABLED;
- } else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * cpus_allowed cannot be empty.
*/
- if (cs->partition_root_state == PRS_ERROR) {
- cs->partition_root_state = 0;
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
}
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmp);
- if (err)
- goto out;
-
- cs->partition_root_state = 0;
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+ /*
+ * If an attempt to become local partition root fails,
+ * try to become a remote partition root instead.
+ */
+ if (err && remote_partition_enable(cs, new_prs, &tmpmask))
+ err = 0;
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ */
+ new_xcpus_state = true;
+ } else {
+ /*
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
+ */
+ if (is_remote_partition(cs))
+ remote_partition_disable(cs, &tmpmask);
+ else
+ update_parent_effective_cpumask(cs, partcmd_disable,
+ NULL, &tmpmask);
- /* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ /*
+ * Invalidation of child partitions will be done in
+ * update_cpumasks_hier().
+ */
}
-
+out:
/*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
*/
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ if (err) {
+ new_prs = -new_prs;
+ update_partition_exclusive(cs, new_prs);
+ }
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
+ if (!is_partition_valid(cs))
+ reset_partition_data(cs);
+ else if (new_xcpus_state)
+ partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(new_xcpus_state);
- rebuild_sched_domains_locked();
-out:
- free_cpumasks(NULL, &tmp);
- return err;
+ /* Force update if switching back to member */
+ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
+
+ notify_partition_change(cs, old_prs);
+ free_cpumasks(NULL, &tmpmask);
+ return 0;
}
/*
@@ -2103,103 +3247,198 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
+}
+
+static void reset_migrate_dl_data(struct cpuset *cs)
+{
+ cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
+}
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
+ struct cpuset *cs, *oldcs;
struct task_struct *task;
+ bool cpus_updated, mems_updated;
int ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = cpuset_attach_old_cs;
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!is_in_v2_mode() &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
goto out_unlock;
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
+
+ /*
+ * Skip rights over task check in v2 when nothing changes,
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (dl_task(task)) {
+ cs->nr_migrate_dl_tasks++;
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ }
+ }
+
+ if (!cs->nr_migrate_dl_tasks)
+ goto out_success;
+
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
goto out_unlock;
+ }
+
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
+ goto out_unlock;
+ }
}
+out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
- ret = 0;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
+ struct cpuset *cs;
cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ if (cs->nr_migrate_dl_tasks) {
+ int cpu = cpumask_any(cs->effective_cpus);
+
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ reset_migrate_dl_data(cs);
+ }
- percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ subpartitions_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
+ mutex_lock(&cpuset_mutex);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2222,13 +3461,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
+ if (cs->nr_migrate_dl_tasks) {
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ reset_migrate_dl_data(cs);
+ }
+
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -2240,6 +3486,9 @@ typedef enum {
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -2259,8 +3508,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2296,8 +3545,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
return retval;
}
@@ -2308,8 +3557,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2322,8 +3571,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
return retval;
}
@@ -2362,8 +3611,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2377,6 +3626,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ retval = update_exclusive_cpumask(cs, trialcs, buf);
+ break;
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
@@ -2387,8 +3639,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2424,8 +3676,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+ break;
+ case FILE_EFFECTIVE_XCPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+ break;
case FILE_SUBPARTS_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
+ break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
break;
default:
ret = -EINVAL;
@@ -2477,23 +3738,36 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
BUG();
}
- /* Unrechable but makes gcc happy */
+ /* Unreachable but makes gcc happy */
return 0;
}
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
@@ -2512,22 +3786,24 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
css_get(&cs->css);
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2679,24 +3955,52 @@ static struct cftype dfl_files[] = {
.write = sched_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
+ },
+
+ {
+ .name = "cpus.exclusive",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_EXCLUSIVE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.exclusive.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_XCPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "cpus.subpartitions",
.seq_show = cpuset_common_seq_show,
.private = FILE_SUBPARTS_CPULIST,
- .flags = CFTYPE_DEBUG,
+ .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
+ },
+
+ {
+ .name = "cpus.isolated",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
},
{ } /* terminate */
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -2714,11 +4018,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
}
- set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
+ INIT_LIST_HEAD(&cs->remote_sibling);
+
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
}
@@ -2733,8 +4042,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -2750,7 +4059,20 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
+ /*
+ * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (!is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
+
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -2759,7 +4081,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -2785,8 +4107,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -2805,10 +4127,10 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -2825,8 +4147,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2838,11 +4160,12 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
cpumask_copy(top_cpuset.cpus_allowed,
@@ -2851,7 +4174,69 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ mutex_lock(&cpuset_mutex);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ mutex_lock(&cpuset_mutex);
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+ mutex_unlock(&cpuset_mutex);
}
/*
@@ -2861,11 +4246,33 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
+
+ /* CLONE_INTO_CGROUP */
+ mutex_lock(&cpuset_mutex);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ mutex_unlock(&cpuset_mutex);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -2878,6 +4285,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
.legacy_cftypes = legacy_files,
.dfl_cftypes = dfl_files,
@@ -2893,20 +4302,24 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
+ cpumask_setall(top_cpuset.effective_xcpus);
+ cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
+ INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -2956,27 +4369,26 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- percpu_up_write(&cpuset_rwsem);
-
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
*/
- if (is_empty)
+ if (is_empty) {
+ mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
-
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
+ }
}
static void
@@ -2984,7 +4396,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -2995,7 +4408,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3007,6 +4420,30 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
+/*
+ * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
+ * progress.
+ * Return: true if successful, false otherwise
+ *
+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+static bool cpuset_hotplug_cpus_read_trylock(void)
+{
+ int retries = 0;
+
+ while (!cpus_read_trylock()) {
+ /*
+ * CPU hotplug still in progress. Retry 5 times
+ * with a 10ms wait before bailing out.
+ */
+ if (++retries > 5)
+ return false;
+ msleep(10);
+ }
+ return true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -3022,77 +4459,91 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ bool remote;
+ int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
goto retry;
}
- parent = parent_cs(cs);
+ parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
- if (cs->nr_subparts_cpus)
- /*
- * Make sure that CPUs allocated to child partitions
- * do not show up in effective_cpus.
- */
- cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
-
if (!tmp || !cs->partition_root_state)
goto update_tasks;
/*
- * In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * Compute effective_cpus for valid partition root, may invalidate
+ * child partition roots if necessary.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
- if (cs->nr_subparts_cpus) {
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
- compute_effective_cpumask(&new_cpus, cs, parent);
- }
-
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- cs->partition_root_state = PRS_ERROR;
- }
+ remote = is_remote_partition(cs);
+ if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
+ compute_partition_effective_cpumask(cs, &new_cpus);
+
+ if (remote && cpumask_empty(&new_cpus) &&
+ partition_is_populated(cs, NULL) &&
+ cpuset_hotplug_cpus_read_trylock()) {
+ remote_partition_disable(cs, tmp);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ remote = false;
cpuset_force_rebuild();
+ cpus_read_unlock();
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ if (is_local_partition(cs) && (!is_partition_valid(parent) ||
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
+ /*
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one.
+ */
+ else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ partcmd = partcmd_update;
+
+ /*
+ * cpus_read_lock needs to be held before calling
+ * update_parent_effective_cpumask(). To avoid circular lock
+ * dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+ if (partcmd >= 0) {
+ if (!cpuset_hotplug_cpus_read_trylock())
+ goto update_tasks;
+
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ cpus_read_unlock();
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
+ compute_partition_effective_cpumask(cs, &new_cpus);
+ cpuset_force_rebuild();
+ }
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (!cpus_updated && !mems_updated)
+ goto unlock; /* Hotplug doesn't affect this cpuset */
+
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
@@ -3101,11 +4552,13 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
- percpu_up_write(&cpuset_rwsem);
+unlock:
+ mutex_unlock(&cpuset_mutex);
}
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * @work: unused
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -3131,21 +4584,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
/*
- * If subparts_cpus is populated, it is likely that the check below
- * will produce a false positive on cpus_updated when the cpu list
- * isn't changed. It is extra work, but it is better to be safe.
+ * If subpartitions_cpus is populated, it is likely that the check
+ * below will produce a false positive on cpus_updated when the cpu
+ * list isn't changed. It is extra work, but it is better to be safe.
*/
- cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+ !cpumask_empty(subpartitions_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
- /* synchronize cpus_allowed to cpu_active_mask */
+ /*
+ * In the rare case that hotplug removes all the cpus in
+ * subpartitions_cpus, we assumed that cpus are updated.
+ */
+ if (!cpus_updated && top_cpuset.nr_subparts)
+ cpus_updated = true;
+
+ /* For v1, synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
if (!on_dfl)
@@ -3153,17 +4614,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/*
* Make sure that CPUs allocated to child partitions
* do not show up in effective_cpus. If no CPU is left,
- * we clear the subparts_cpus & let the child partitions
+ * we clear the subpartitions_cpus & let the child partitions
* fight for the CPUs again.
*/
- if (top_cpuset.nr_subparts_cpus) {
- if (cpumask_subset(&new_cpus,
- top_cpuset.subparts_cpus)) {
- top_cpuset.nr_subparts_cpus = 0;
- cpumask_clear(top_cpuset.subparts_cpus);
+ if (!cpumask_empty(subpartitions_cpus)) {
+ if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+ top_cpuset.nr_subparts = 0;
+ cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
- top_cpuset.subparts_cpus);
+ subpartitions_cpus);
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3181,7 +4641,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
@@ -3238,11 +4698,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3250,14 +4705,17 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
*/
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
@@ -3271,16 +4729,37 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3295,13 +4774,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3321,6 +4809,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
@@ -3353,7 +4842,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}
/**
- * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
@@ -3376,7 +4865,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
+/*
* cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
@@ -3416,10 +4905,10 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@@ -3451,8 +4940,8 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3476,12 +4965,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
-
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3491,6 +4982,9 @@ int cpuset_mem_spread_node(void)
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
+/**
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ */
int cpuset_slab_spread_node(void)
{
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
@@ -3499,7 +4993,6 @@ int cpuset_slab_spread_node(void)
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
-
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
@@ -3548,8 +5041,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3564,7 +5057,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
void __cpuset_memory_pressure_bump(void)
{
@@ -3599,7 +5092,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..66d1708042a7 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -65,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
+ unsigned int state;
rcu_read_lock();
- ret = task_freezer(task)->state & CGROUP_FREEZING;
+ /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
+ * !FROZEN check is required, because the FREEZING bit is not cleared
+ * when the state FROZEN is reached.
+ */
+ state = task_freezer(task)->state;
+ ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
rcu_read_unlock();
return ret;
@@ -107,16 +114,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
}
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -131,14 +140,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec_cpuslocked(&freezer_active);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -179,6 +190,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +283,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +361,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +370,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -386,6 +390,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -414,6 +419,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644
index 000000000000..79a3717a5803
--- /dev/null
+++ b/kernel/cgroup/misc.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM U64_MAX
+
+/* Miscellaneous res name, keep it in sync with enum misc_res_type */
+static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+ /* AMD SEV ASIDs resource */
+ "sev",
+ /* AMD SEV-ES ASIDs resource */
+ "sev_es",
+#endif
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+ return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+ return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Get the current total usage of the resource.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return: Current total usage of the resource.
+ */
+u64 misc_cg_res_total_usage(enum misc_res_type type)
+{
+ if (valid_type(type))
+ return atomic64_read(&root_cg.res[type].usage);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then the charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
+{
+ if (!valid_type(type))
+ return -EINVAL;
+
+ WRITE_ONCE(misc_res_capacity[type], capacity);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+ u64 amount)
+{
+ WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
+ "misc cgroup resource %s became less than 0",
+ misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ * capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
+{
+ struct misc_cg *i, *j;
+ int ret;
+ struct misc_res *res;
+ u64 new_usage;
+
+ if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+ return -EINVAL;
+
+ if (!amount)
+ return 0;
+
+ for (i = cg; i; i = parent_misc(i)) {
+ res = &i->res[type];
+
+ new_usage = atomic64_add_return(amount, &res->usage);
+ if (new_usage > READ_ONCE(res->max) ||
+ new_usage > READ_ONCE(misc_res_capacity[type])) {
+ ret = -EBUSY;
+ goto err_charge;
+ }
+ }
+ return 0;
+
+err_charge:
+ for (j = i; j; j = parent_misc(j)) {
+ atomic64_inc(&j->res[type].events);
+ cgroup_file_notify(&j->events_file);
+ }
+
+ for (j = cg; j != i; j = parent_misc(j))
+ misc_cg_cancel_charge(type, j, amount);
+ misc_cg_cancel_charge(type, i, amount);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
+{
+ struct misc_cg *i;
+
+ if (!(amount && valid_type(type) && cg))
+ return;
+
+ for (i = cg; i; i = parent_misc(i))
+ misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+ int i;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ u64 max;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (READ_ONCE(misc_res_capacity[i])) {
+ max = READ_ONCE(cg->res[i].max);
+ if (max == MAX_NUM)
+ seq_printf(sf, "%s max\n", misc_res_name[i]);
+ else
+ seq_printf(sf, "%s %llu\n", misc_res_name[i],
+ max);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ * integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than the u64 capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ u64 max;
+ int ret = 0, i;
+ enum misc_res_type type = MISC_CG_RES_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (!strcmp(misc_res_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_RES_TYPES)
+ return -EINVAL;
+
+ if (!strcmp(MAX_STR, buf)) {
+ max = MAX_NUM;
+ } else {
+ ret = kstrtou64(buf, 0, &max);
+ if (ret)
+ return ret;
+ }
+
+ cg = css_misc(of_css(of));
+
+ if (READ_ONCE(misc_res_capacity[type]))
+ WRITE_ONCE(cg->res[type].max, max);
+ else
+ ret = -EINVAL;
+
+ return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 usage;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ usage = atomic64_read(&cg->res[i].usage);
+ if (READ_ONCE(misc_res_capacity[i]) || usage)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 cap;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ cap = READ_ONCE(misc_res_capacity[i]);
+ if (cap)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
+ }
+
+ return 0;
+}
+
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ u64 events;
+ int i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ events = atomic64_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
+/* Misc cgroup interface files */
+static struct cftype misc_cg_files[] = {
+ {
+ .name = "max",
+ .write = misc_cg_max_write,
+ .seq_show = misc_cg_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = misc_cg_current_show,
+ },
+ {
+ .name = "capacity",
+ .seq_show = misc_cg_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
+ {}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+ enum misc_res_type i;
+ struct misc_cg *cg;
+
+ if (!parent_css) {
+ cg = &root_cg;
+ } else {
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ WRITE_ONCE(cg->res[i].max, MAX_NUM);
+ atomic64_set(&cg->res[i].usage, 0);
+ }
+
+ return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Free the misc cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_misc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys misc_cgrp_subsys = {
+ .css_alloc = misc_cg_alloc,
+ .css_free = misc_cg_free,
+ .legacy_cftypes = misc_cg_files,
+ .dfl_cftypes = misc_cg_files,
+};
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 812a61afd538..144a464e45c6 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
@@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- refcount_set(&new_ns->count, 1);
+ refcount_set(&new_ns->ns.count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
.install = cgroupns_install,
.owner = cgroupns_owner,
};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..7695e60bcb40 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -47,6 +47,7 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
@@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
*/
if (new > limit)
goto revert;
+
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -332,6 +360,11 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index ae042c347c64..ef5878fb2005 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
* stop uncharging
@@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
/**
* rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
* @device: pointer to rdmacg device
* @index: index of the resource to uncharge in cgroup in given resource pool
*/
@@ -244,7 +246,7 @@ EXPORT_SYMBOL(rdmacg_uncharge);
* This function follows charging resource in hierarchical way.
* It will fail if the charge would cause the new value to exceed the
* hierarchical limit.
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
* Returns pointer to rdmacg for this resource when charging is successful.
*
* Charger needs to account resources on two criteria.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b6397a186ce9..a8350d2d63e6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -22,16 +26,11 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* rstat_cpu->updated_children list. See the comment on top of
* cgroup_rstat_cpu definition for details.
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- struct cgroup *parent;
unsigned long flags;
- /* nothing to do for root */
- if (!cgroup_parent(cgrp))
- return;
-
/*
* Speculative already-on-list test. This may race leading to
* temporary inaccuracies, which is fine.
@@ -40,16 +39,16 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
/* put @cgrp and all ancestors on the corresponding updated lists */
- for (parent = cgroup_parent(cgrp); parent;
- cgrp = parent, parent = cgroup_parent(cgrp)) {
+ while (true) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -58,88 +57,173 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (rstatc->updated_next)
break;
+ /* Root has no parent to link it to, but mark it busy */
+ if (!parent) {
+ rstatc->updated_next = cgrp;
+ break;
+ }
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
prstatc->updated_children = cgrp;
+
+ cgrp = parent;
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flush
*
- * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list built from the tail backward like "pushing"
+ * cgroups into a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+ struct cgroup *child, int cpu)
+{
+ struct cgroup *chead = child; /* Head of child cgroup level */
+ struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
+ struct cgroup *parent, *grandchild;
+ struct cgroup_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+next_level:
+ while (chead) {
+ child = chead;
+ chead = child->rstat_flush_next;
+ parent = cgroup_parent(child);
+
+ /* updated_next is parent cgroup terminated */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = cgroup_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ chead = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned cgroup is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child cgroups if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent cgroup. An exception
+ * here is the cgroup root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
-
- if (pos == root)
- return NULL;
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+ struct cgroup *head = NULL, *parent, *child;
+ unsigned long flags;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
*/
- if (!pos)
- pos = root;
- else
- pos = cgroup_parent(pos);
+ raw_spin_lock_irqsave(cpu_lock, flags);
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
- }
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ goto unlock_ret;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
- struct cgroup_rstat_cpu *nrstatc;
+ parent = cgroup_parent(root);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (true) {
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
+ while (*nextp != root) {
+ struct cgroup_rstat_cpu *nrstatc;
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
-
*nextp = rstatc->updated_next;
- rstatc->updated_next = NULL;
-
- return pos;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+ return head;
+}
+
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ */
+
+__bpf_hook_start();
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
}
+__bpf_hook_end();
+
/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
int cpu;
@@ -147,15 +231,13 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- raw_spin_lock(cpu_lock);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -163,11 +245,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
- /* if @may_sleep, play nice and yield if necessary */
- if (may_sleep && (need_resched() ||
- spin_needbreak(&cgroup_rstat_lock))) {
+ /* play nice and yield if necessary */
+ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
spin_unlock_irq(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
@@ -189,32 +269,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
*
* This function may block.
*/
-void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
spin_unlock_irq(&cgroup_rstat_lock);
}
/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cgroup_rstat_lock, flags);
- cgroup_rstat_flush_locked(cgrp, false);
- spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
-}
-
-/**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
@@ -227,7 +292,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
}
/**
@@ -286,8 +351,6 @@ void __init cgroup_rstat_boot(void)
for_each_possible_cpu(cpu)
raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
-
- BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}
/*
@@ -300,6 +363,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -308,50 +374,65 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup_base_stat cur, delta;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup_base_stat delta;
unsigned seq;
+ /* Root-level stats are sourced from system-wide CPU stats */
+ if (!parent)
+ return;
+
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
- /* propagate percpu delta to global */
- delta = cur;
+ /* propagate per-cpu delta to cgroup and per-cpu global statistics */
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
- /* propagate global delta to parent */
- if (parent) {
+ /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
+ if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = rstatc->subtree_bstat;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
}
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -359,18 +440,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -382,11 +465,16 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
@@ -395,13 +483,12 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
- cputime->stime = 0;
- cputime->utime = 0;
- cputime->sum_exec_runtime = 0;
+ memset(bstat, 0, sizeof(*bstat));
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
@@ -422,8 +509,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -431,27 +520,61 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/compat.c b/kernel/compat.c
index b8d2800bb4b7..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
@@ -255,11 +255,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return -EFAULT;
switch (_NSIG_WORDS) {
case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
- /* fall through */
+ fallthrough;
case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
- /* fall through */
+ fallthrough;
case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
- /* fall through */
+ fallthrough;
case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
#else
@@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return 0;
}
EXPORT_SYMBOL_GPL(get_compat_sigset);
-
-/*
- * Allocate user-space memory for the duration of a single system call,
- * in order to marshall parameters inside a compat thunk.
- */
-void __user *compat_alloc_user_space(unsigned long len)
-{
- void __user *ptr;
-
- /* If len would occupy more than half of the entire compat space... */
- if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
- return NULL;
-
- ptr = arch_compat_alloc_user_space(len);
-
- if (unlikely(!access_ok(ptr, len)))
- return NULL;
-
- return ptr;
-}
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
deleted file mode 100644
index d3fd428f4b92..000000000000
--- a/kernel/configs/android-base.config
+++ /dev/null
@@ -1,161 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
-# CONFIG_DEVMEM is not set
-# CONFIG_FHANDLE is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_NFSD is not set
-# CONFIG_NFS_FS is not set
-# CONFIG_OABI_COMPAT is not set
-# CONFIG_SYSVIPC is not set
-# CONFIG_USELIB is not set
-CONFIG_ANDROID=y
-CONFIG_ANDROID_BINDER_IPC=y
-CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
-CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
-CONFIG_ASHMEM=y
-CONFIG_AUDIT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_BPF=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DEFAULT_SECURITY_SELINUX=y
-CONFIG_EMBEDDED=y
-CONFIG_FB=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_INET=y
-CONFIG_INET_DIAG_DESTROY=y
-CONFIG_INET_ESP=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_IP6_NF_FILTER=y
-CONFIG_IP6_NF_IPTABLES=y
-CONFIG_IP6_NF_MANGLE=y
-CONFIG_IP6_NF_RAW=y
-CONFIG_IP6_NF_TARGET_REJECT=y
-CONFIG_IPV6=y
-CONFIG_IPV6_MIP6=y
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_IPV6_ROUTE_INFO=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_NF_ARPFILTER=y
-CONFIG_IP_NF_ARPTABLES=y
-CONFIG_IP_NF_ARP_MANGLE=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_IP_NF_MATCH_AH=y
-CONFIG_IP_NF_MATCH_ECN=y
-CONFIG_IP_NF_MATCH_TTL=y
-CONFIG_IP_NF_NAT=y
-CONFIG_IP_NF_RAW=y
-CONFIG_IP_NF_SECURITY=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_IP_NF_TARGET_NETMAP=y
-CONFIG_IP_NF_TARGET_REDIRECT=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_NET=y
-CONFIG_NETDEVICES=y
-CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
-CONFIG_NETFILTER_XT_MATCH_COMMENT=y
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_HELPER=y
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
-CONFIG_NETFILTER_XT_MATCH_LENGTH=y
-CONFIG_NETFILTER_XT_MATCH_LIMIT=y
-CONFIG_NETFILTER_XT_MATCH_MAC=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_QUOTA=y
-CONFIG_NETFILTER_XT_MATCH_SOCKET=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
-CONFIG_NETFILTER_XT_MATCH_STRING=y
-CONFIG_NETFILTER_XT_MATCH_TIME=y
-CONFIG_NETFILTER_XT_MATCH_U32=y
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y
-CONFIG_NETFILTER_XT_TARGET_TRACE=y
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_CLS_U32=y
-CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_U32=y
-CONFIG_NET_KEY=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=y
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_AMANDA=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_FTP=y
-CONFIG_NF_CONNTRACK_H323=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_IPV6=y
-CONFIG_NF_CONNTRACK_IRC=y
-CONFIG_NF_CONNTRACK_NETBIOS_NS=y
-CONFIG_NF_CONNTRACK_PPTP=y
-CONFIG_NF_CONNTRACK_SANE=y
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_TFTP=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NF_CT_PROTO_DCCP=y
-CONFIG_NF_CT_PROTO_SCTP=y
-CONFIG_NF_CT_PROTO_UDPLITE=y
-CONFIG_NF_NAT=y
-CONFIG_NO_HZ=y
-CONFIG_PACKET=y
-CONFIG_PM_AUTOSLEEP=y
-CONFIG_PM_WAKELOCKS=y
-CONFIG_PPP=y
-CONFIG_PPP_BSDCOMP=y
-CONFIG_PPP_DEFLATE=y
-CONFIG_PPP_MPPE=y
-CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
-CONFIG_RANDOMIZE_BASE=y
-CONFIG_RTC_CLASS=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_SECCOMP=y
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
-CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
-CONFIG_SYNC=y
-CONFIG_TUN=y
-CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_CONFIGFS=y
-CONFIG_USB_CONFIGFS_F_FS=y
-CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
-CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
deleted file mode 100644
index 81e9af7dcec2..000000000000
--- a/kernel/configs/android-recommended.config
+++ /dev/null
@@ -1,129 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_NF_CONNTRACK_SIP is not set
-# CONFIG_PM_WAKELOCKS_GC is not set
-# CONFIG_VT is not set
-CONFIG_ARM64_SW_TTBR0_PAN=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_STACKPROTECTOR_STRONG=y
-CONFIG_COMPACTION=y
-CONFIG_CPU_SW_DOMAIN_PAN=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_UEVENT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
-CONFIG_DRAGONRISE_FF=y
-CONFIG_ENABLE_DEFAULT_TRACERS=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_FUSE_FS=y
-CONFIG_GREENASIA_FF=y
-CONFIG_HIDRAW=y
-CONFIG_HID_A4TECH=y
-CONFIG_HID_ACRUX=y
-CONFIG_HID_ACRUX_FF=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-CONFIG_HID_DRAGONRISE=y
-CONFIG_HID_ELECOM=y
-CONFIG_HID_EMS_FF=y
-CONFIG_HID_EZKEY=y
-CONFIG_HID_GREENASIA=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_HOLTEK=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_KEYTOUCH=y
-CONFIG_HID_KYE=y
-CONFIG_HID_LCPOWER=y
-CONFIG_HID_LOGITECH=y
-CONFIG_HID_LOGITECH_DJ=y
-CONFIG_HID_MAGICMOUSE=y
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
-CONFIG_HID_MULTITOUCH=y
-CONFIG_HID_NTRIG=y
-CONFIG_HID_ORTEK=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_PICOLCD=y
-CONFIG_HID_PRIMAX=y
-CONFIG_HID_PRODIKEYS=y
-CONFIG_HID_ROCCAT=y
-CONFIG_HID_SAITEK=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SMARTJOYPLUS=y
-CONFIG_HID_SONY=y
-CONFIG_HID_SPEEDLINK=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_HID_THRUSTMASTER=y
-CONFIG_HID_TIVO=y
-CONFIG_HID_TOPSEED=y
-CONFIG_HID_TWINHAN=y
-CONFIG_HID_UCLOGIC=y
-CONFIG_HID_WACOM=y
-CONFIG_HID_WALTOP=y
-CONFIG_HID_WIIMOTE=y
-CONFIG_HID_ZEROPLUS=y
-CONFIG_HID_ZYDACRON=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_GPIO=y
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_TABLET=y
-CONFIG_INPUT_UINPUT=y
-CONFIG_ION=y
-CONFIG_JOYSTICK_XPAD=y
-CONFIG_JOYSTICK_XPAD_FF=y
-CONFIG_JOYSTICK_XPAD_LEDS=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_KSM=y
-CONFIG_LOGIG940_FF=y
-CONFIG_LOGIRUMBLEPAD2_FF=y
-CONFIG_LOGITECH_FF=y
-CONFIG_MD=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_MSDOS_FS=y
-CONFIG_PANIC_TIMEOUT=5
-CONFIG_PANTHERLORD_FF=y
-CONFIG_PERF_EVENTS=y
-CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
-CONFIG_PM_WAKELOCKS_LIMIT=0
-CONFIG_POWER_SUPPLY=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SMARTJOYPLUS_FF=y
-CONFIG_SND=y
-CONFIG_SOUND=y
-CONFIG_STRICT_KERNEL_RWX=y
-CONFIG_SUSPEND_TIME=y
-CONFIG_TABLET_USB_ACECAD=y
-CONFIG_TABLET_USB_AIPTEK=y
-CONFIG_TABLET_USB_GTCO=y
-CONFIG_TABLET_USB_HANWANG=y
-CONFIG_TABLET_USB_KBTAB=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_TASK_XACCT=y
-CONFIG_TIMER_STATS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_UHID=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB_USBNET=y
-CONFIG_VFAT_FS=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
new file mode 100644
index 000000000000..4722b998a324
--- /dev/null
+++ b/kernel/configs/debug.config
@@ -0,0 +1,108 @@
+# Help: Debugging for CI systems and finding regressions
+#
+# The config is based on running daily CI for enterprise Linux distros to
+# seek regressions on linux-next builds on different bare-metal and virtual
+# platforms. It can be used for example,
+#
+# $ make ARCH=arm64 defconfig debug.config
+#
+# Keep alphabetically sorted inside each section.
+#
+# printk and dmesg options
+#
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_PRINTK_CALLER=y
+CONFIG_PRINTK_TIME=y
+CONFIG_SYMBOLIC_ERRNAME=y
+#
+# Compile-time checks and compiler options
+#
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_FRAME_WARN=2048
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+#
+# Generic Kernel Debugging Instruments
+#
+# CONFIG_UBSAN_ALIGNMENT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_TRAP is not set
+# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_FS_ALLOW_ALL=y
+CONFIG_DEBUG_IRQFLAGS=y
+CONFIG_UBSAN=y
+CONFIG_UBSAN_BOOL=y
+CONFIG_UBSAN_BOUNDS=y
+CONFIG_UBSAN_ENUM=y
+CONFIG_UBSAN_SHIFT=y
+CONFIG_UBSAN_UNREACHABLE=y
+#
+# Memory Debugging
+#
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
+# CONFIG_DEBUG_RODATA_TEST is not set
+# CONFIG_DEBUG_WX is not set
+# CONFIG_KFENCE is not set
+# CONFIG_PAGE_POISONING is not set
+# CONFIG_SLUB_STATS is not set
+CONFIG_PAGE_EXTENSION=y
+CONFIG_PAGE_OWNER=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_DEBUG_OBJECTS_FREE=y
+CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_OBJECTS_TIMERS=y
+CONFIG_DEBUG_OBJECTS_WORK=y
+CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_VIRTUAL=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_PGFLAGS=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_GENERIC_PTDUMP=y
+CONFIG_KASAN=y
+CONFIG_KASAN_GENERIC=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
+CONFIG_PTDUMP_DEBUGFS=y
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_SLUB_DEBUG_ON=y
+#
+# Debug Oops, Lockups and Hangs
+#
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_SOFTLOCKUP_DETECTOR=y
+#
+# Lock Debugging (spinlocks, mutexes, etc...)
+#
+# CONFIG_PROVE_RAW_LOCK_NESTING is not set
+CONFIG_PROVE_LOCKING=y
+#
+# Debug kernel data structures
+#
+CONFIG_BUG_ON_DATA_CORRUPTION=y
+#
+# RCU Debugging
+#
+CONFIG_PROVE_RCU=y
+CONFIG_PROVE_RCU_LIST=y
+#
+# Tracers
+#
+CONFIG_BRANCH_PROFILE_NONE=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_FTRACE=y
+CONFIG_FUNCTION_TRACER=y
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..95a400f042b1
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,98 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO
+# CONFIG_UBSAN_UNREACHABLE
+# CONFIG_UBSAN_BOOL
+# CONFIG_UBSAN_ENUM
+# CONFIG_UBSAN_ALIGNMENT
+CONFIG_UBSAN_SANITIZE_ALL=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
+
+# Attack surface reduction: Use only modesetting video drivers.
+# CONFIG_DRM_LEGACY is not set
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
CONFIG_NET=y
CONFIG_NET_CORE=y
CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
CONFIG_PM=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..2c6e001a7284
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1,2 @@
+# Help: Enable Rust
+CONFIG_RUST=y
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
new file mode 100644
index 000000000000..ffb9dcafca26
--- /dev/null
+++ b/kernel/configs/tiny-base.config
@@ -0,0 +1 @@
+CONFIG_EXPERT=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..00009f7d0835 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -6,6 +6,5 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
-# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..35f48671b8d5
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,18 @@
+# Help: Debugging options for tip tree testing
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_NOINSTR_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index ff756221f112..6878b9a49be8 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
# global stuff - these enable us to allow some
# of the not so generic stuff below for xen
CONFIG_PARAVIRT=y
@@ -34,7 +36,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_SCSI_FRONTEND=m
# others
CONFIG_XEN_BALLOON=y
-CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DEV_EVTCHN=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..6ef0b35fc28c 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -21,6 +23,411 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
+
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on dyntick-idle entry. */
+static __always_inline void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static __always_inline void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
+static __always_inline void rcu_dynticks_task_trace_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
+static __always_inline void rcu_dynticks_task_trace_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ seq = ct_state_inc(offset);
+ // RCU is no longer watching. Better be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_dynticks_task_trace_exit(); // After ->dynticks update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_dynticks_nesting() == 0);
+ if (ct_dynticks_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->dynticks_nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_dynticks_task_enter();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_dynticks_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->dynticks_nesting++;
+ return;
+ }
+ rcu_dynticks_task_exit();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->dynticks_nesting, 1);
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ instrumentation_end();
+}
+
+/**
+ * ct_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_exit(void)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ instrumentation_begin();
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (ct_dynticks_nmi_nesting() != 1) {
+ trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
+ ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+ ct_dynticks_nmi_nesting() - 2);
+ instrumentation_end();
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(RCU_DYNTICKS_IDX);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_dynticks_task_enter();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+
+ if (!in_nmi())
+ rcu_dynticks_task_exit();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(RCU_DYNTICKS_IDX);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ ct_dynticks_nmi_nesting(),
+ ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+ ct_dynticks_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -28,9 +435,6 @@
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -51,29 +455,32 @@ static __always_inline void context_tracking_recursion_exit(void)
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
+
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
if (!context_tracking_recursion_enter())
return;
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
if (state == CONTEXT_USER) {
@@ -82,35 +489,77 @@ void noinstr __context_tracking_enter(enum ctx_state state)
vtime_user_enter(current);
instrumentation_end();
}
- rcu_user_enter();
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ raw_atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
-void context_tracking_enter(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
{
unsigned long flags;
/*
* Some contexts may involve an exception occuring in an irq,
* leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
@@ -119,21 +568,32 @@ void context_tracking_enter(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_enter(state);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -143,32 +603,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
if (!context_tracking_recursion_enter())
return;
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
+ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
if (state == CONTEXT_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, CONTEXT_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, CONTEXT_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ raw_atomic_sub(state, &ct->state);
+ }
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_exit(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
unsigned long flags;
@@ -176,19 +668,30 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_exit(state);
+ __ct_user_exit(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
-void context_tracking_user_exit(void)
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
+ */
+void user_exit_callable(void)
{
user_exit();
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
{
static __initdata bool initialized = false;
@@ -212,12 +715,14 @@ void __init context_tracking_cpu_set(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578ecf17..e6ec3ba4950b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
@@ -31,7 +32,11 @@
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
+#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
+#include <linux/random.h>
+#include <linux/cc_platform.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -40,16 +45,22 @@
#include "smpboot.h"
/**
- * cpuhp_cpu_state - Per cpu hotplug state storage
+ * struct cpuhp_cpu_state - Per cpu hotplug state storage
* @state: The current cpu state
* @target: The target state
+ * @fail: Current CPU hotplug callback state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
+ * @cpu: CPU number
+ * @node: Remote CPU node; for multi-instance, do a
+ * single entry callback for install/remove
+ * @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
+ * @ap_sync_state: State for AP synchronization
* @done_up: Signal completion to the issuer of the task for cpu-up
* @done_down: Signal completion to the issuer of the task for cpu-down
*/
@@ -67,6 +78,7 @@ struct cpuhp_cpu_state {
struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
+ atomic_t ap_sync_state;
struct completion done_up;
struct completion done_down;
#endif
@@ -104,11 +116,12 @@ static inline void cpuhp_lock_release(bool bringup) { }
#endif
/**
- * cpuhp_step - Hotplug state machine step
+ * struct cpuhp_step - Hotplug state machine step
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
* @cant_stop: Bringup/teardown can't be stopped at this step
+ * @multi_instance: State has multiple instances which get added afterwards
*/
struct cpuhp_step {
const char *name;
@@ -122,7 +135,9 @@ struct cpuhp_step {
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} teardown;
+ /* private: */
struct hlist_head list;
+ /* public: */
bool cant_stop;
bool multi_instance;
};
@@ -135,8 +150,13 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
return cpuhp_hp_states + state;
}
+static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
+{
+ return bringup ? !step->startup.single : !step->teardown.single;
+}
+
/**
- * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
@@ -144,6 +164,8 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
* @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
+ *
+ * Return: %0 on success or a negative errno code
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
@@ -157,26 +179,24 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
if (st->fail == state) {
st->fail = CPUHP_INVALID;
-
- if (!(bringup ? step->startup.single : step->teardown.single))
- return 0;
-
return -EAGAIN;
}
+ if (cpuhp_step_empty(bringup, step)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (!step->multi_instance) {
WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
- if (!cb)
- return 0;
+
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
- if (!cbm)
- return 0;
/* Single invocation for instance add/remove */
if (node) {
@@ -259,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state)
return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}
+/* Synchronization state management */
+enum cpuhp_sync_state {
+ SYNC_STATE_DEAD,
+ SYNC_STATE_KICKED,
+ SYNC_STATE_SHOULD_DIE,
+ SYNC_STATE_ALIVE,
+ SYNC_STATE_SHOULD_ONLINE,
+ SYNC_STATE_ONLINE,
+};
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state: The synchronization state to set
+ *
+ * No synchronization point. Just update of the synchronization state, but implies
+ * a full barrier so that the AP changes are visible before the control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+ enum cpuhp_sync_state next_state)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ ktime_t now, end, start = ktime_get();
+ int sync;
+
+ end = start + 10ULL * NSEC_PER_SEC;
+
+ sync = atomic_read(st);
+ while (1) {
+ if (sync == state) {
+ if (!atomic_try_cmpxchg(st, &sync, next_state))
+ continue;
+ return true;
+ }
+
+ now = ktime_get();
+ if (now > end) {
+ /* Timeout. Leave the state unchanged */
+ return false;
+ } else if (now - start < NSEC_PER_MSEC) {
+ /* Poll for one millisecond */
+ arch_cpuhp_sync_state_poll();
+ } else {
+ usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
+ }
+ sync = atomic_read(st);
+ }
+ return true;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+ cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+ do {
+ /* CPU can have reported dead already. Don't overwrite that! */
+ if (sync == SYNC_STATE_DEAD)
+ break;
+ } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+ if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+ /* CPU reached dead state. Invoke the cleanup function */
+ arch_cpuhp_cleanup_dead_cpu(cpu);
+ return;
+ }
+
+ /* No further action possible. Emit message and give up. */
+ pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+ /* Wait for the control CPU to release it. */
+ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+ cpu_relax();
+}
+
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+again:
+ switch (sync) {
+ case SYNC_STATE_DEAD:
+ /* CPU is properly dead */
+ break;
+ case SYNC_STATE_KICKED:
+ /* CPU did not come up in previous attempt */
+ break;
+ case SYNC_STATE_ALIVE:
+ /* CPU is stuck cpuhp_ap_sync_alive(). */
+ break;
+ default:
+ /* CPU failed to report online or dead and is in limbo state. */
+ return false;
+ }
+
+ /* Prepare for booting */
+ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+ goto again;
+
+ return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+ return 0;
+
+ if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+ pr_err("CPU%u failed to report alive state\n", cpu);
+ ret = -EIO;
+ }
+
+ /* Let the architecture cleanup the kick alive mechanics. */
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -330,6 +526,13 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+#ifdef CONFIG_LOCKDEP
+int lockdep_is_cpus_held(void)
+{
+ return percpu_rwsem_is_held(&cpu_hotplug_lock);
+}
+#endif
+
static void lockdep_acquire_cpus_lock(void)
{
rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
@@ -389,7 +592,10 @@ static void lockdep_release_cpus_lock(void)
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
+
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
void __init cpu_smt_disable(bool force)
{
@@ -403,16 +609,33 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
+ cpu_smt_num_threads = 1;
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code.
*/
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads)
{
- if (!topology_smt_supported())
+ WARN_ON(!num_threads || (num_threads > max_threads));
+
+ if (max_threads == 1)
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+ cpu_smt_max_threads = max_threads;
+
+ /*
+ * If SMT has been disabled via the kernel command line or SMT is
+ * not supported, set cpu_smt_num_threads to 1 for consistency.
+ * If enabled, take the architecture requested number of threads
+ * to bring up into account.
+ */
+ if (cpu_smt_control != CPU_SMT_ENABLED)
+ cpu_smt_num_threads = 1;
+ else if (num_threads < cpu_smt_num_threads)
+ cpu_smt_num_threads = num_threads;
}
static int __init smt_cmdline_disable(char *str)
@@ -422,9 +645,31 @@ static int __init smt_cmdline_disable(char *str)
}
early_param("nosmt", smt_cmdline_disable);
-static inline bool cpu_smt_allowed(unsigned int cpu)
+/*
+ * For Archicture supporting partial SMT states check if the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+ return topology_smt_thread_allowed(cpu);
+#else
+ return true;
+#endif
+}
+
+static inline bool cpu_bootable(unsigned int cpu)
{
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ return true;
+
+ /* All CPUs are bootable if controls are not configured */
+ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+ return true;
+
+ /* All CPUs are bootable if CPU is not SMT capable */
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return true;
if (topology_is_primary_thread(cpu))
@@ -439,35 +684,51 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
-/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
#else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;
st->rollback = false;
st->last = NULL;
st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
+ st->target = prev_state;
+
+ /*
+ * Already rolling back. No need invert the bringup value or to change
+ * the current state.
+ */
+ if (st->rollback)
+ return;
+
st->rollback = true;
/*
@@ -481,8 +742,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}
- st->target = prev_state;
- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -502,22 +764,23 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
wait_for_ap_thread(st, st->bringup);
}
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
return ret;
}
-static int bringup_wait_for_ap(unsigned int cpu)
+static int bringup_wait_for_ap_online(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -533,37 +796,99 @@ static int bringup_wait_for_ap(unsigned int cpu)
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
* CPU marked itself as booted_once in notify_cpu_starting() so the
- * cpu_smt_allowed() check will now return false if this is not the
+ * cpu_bootable() check will now return false if this is not the
* primary sibling.
*/
- if (!cpu_smt_allowed(cpu))
+ if (!cpu_bootable(cpu))
return -ECANCELED;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
+{
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
+}
+
+static int cpuhp_bringup_ap(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
+
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
- return cpuhp_kick_ap(st, st->target);
-}
+ return cpuhp_kick_ap(cpu, st, st->target);
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
+}
+#else
static int bringup_cpu(unsigned int cpu)
{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle = idle_thread_get(cpu);
int ret;
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
/*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
- * Prevent irq alloc/free across the bringup.
+ *
+ * Prevent irq alloc/free across the bringup by acquiring the
+ * sparse irq lock. Hold it until the upcoming CPU completes the
+ * startup in cpuhp_online_idle() which allows to avoid
+ * intermediate synchronization points in the architecture code.
*/
irq_lock_sparse();
- /* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
- irq_unlock_sparse();
if (ret)
- return ret;
- return bringup_wait_for_ap(cpu);
+ goto out_unlock;
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
}
+#endif
static int finish_cpu(unsigned int cpu)
{
@@ -576,7 +901,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
@@ -584,10 +909,83 @@ static int finish_cpu(unsigned int cpu)
* Hotplug state machine related functions
*/
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
+/*
+ * Get the next state to run. Empty ones will be skipped. Returns true if a
+ * state must be run.
+ *
+ * st->state will be modified ahead of time, to match state_to_run, as if it
+ * has already ran.
+ */
+static bool cpuhp_next_state(bool bringup,
+ enum cpuhp_state *state_to_run,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ do {
+ if (bringup) {
+ if (st->state >= target)
+ return false;
+
+ *state_to_run = ++st->state;
+ } else {
+ if (st->state <= target)
+ return false;
+
+ *state_to_run = st->state--;
+ }
+
+ if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
+ break;
+ } while (true);
+
+ return true;
+}
+
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
- for (st->state--; st->state > st->target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ enum cpuhp_state state;
+ int ret = 0;
+
+ while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
+ err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -610,16 +1008,16 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state prev_state = st->state;
int ret = 0;
- while (st->state < target) {
- st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
- if (ret) {
- if (can_rollback_cpu(st)) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
- }
- break;
- }
+ ret = cpuhp_invoke_callback_range(true, cpu, st, target);
+ if (ret) {
+ pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
+ cpuhp_reset_state(cpu, st, prev_state);
+ if (can_rollback_cpu(st))
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
+ prev_state));
}
return ret;
}
@@ -627,14 +1025,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
/*
* The cpu hotplug threads manage the bringup and teardown of the cpus
*/
-static void cpuhp_create(unsigned int cpu)
-{
- struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-
- init_completion(&st->done_up);
- init_completion(&st->done_down);
-}
-
static int cpuhp_should_run(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
@@ -683,17 +1073,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
state = st->cb_state;
st->should_run = false;
} else {
- if (bringup) {
- st->state++;
- state = st->state;
- st->should_run = (st->state < st->target);
- WARN_ON_ONCE(st->state > st->target);
- } else {
- state = st->state;
- st->state--;
- st->should_run = (st->state > st->target);
- WARN_ON_ONCE(st->state < st->target);
- }
+ st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
+ if (!st->should_run)
+ goto end;
}
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
@@ -721,6 +1103,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
}
+end:
cpuhp_lock_release(bringup);
lockdep_release_cpus_lock();
@@ -793,7 +1176,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
- ret = cpuhp_kick_ap(st, st->target);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
@@ -801,20 +1184,82 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
static struct smp_hotplug_thread cpuhp_threads = {
.store = &cpuhp_state.thread,
- .create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
+static __init void cpuhp_init_state(void)
+{
+ struct cpuhp_cpu_state *st;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ st = per_cpu_ptr(&cpuhp_state, cpu);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
+ }
+}
+
void __init cpuhp_threads_init(void)
{
+ cpuhp_init_state();
BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
+/*
+ *
+ * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
+ * protected region.
+ *
+ * The operation is still serialized against concurrent CPU hotplug via
+ * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
+ * serialized against other hotplug related activity like adding or
+ * removing of state callbacks and state instances, which invoke either the
+ * startup or the teardown callback of the affected state.
+ *
+ * This is required for subsystems which are unfixable vs. CPU hotplug and
+ * evade lock inversion problems by scheduling work which has to be
+ * completed _before_ cpu_up()/_cpu_down() returns.
+ *
+ * Don't even think about adding anything to this for any new code or even
+ * drivers. It's only purpose is to keep existing lock order trainwrecks
+ * working.
+ *
+ * For cpu_down() there might be valid reasons to finish cleanups which are
+ * not required to be done under cpu_hotplug_lock, but that's a different
+ * story and would be not invoked via this.
+ */
+static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+{
+ /*
+ * cpusets delegate hotplug operations to a worker to "solve" the
+ * lock order problems. Wait for the worker, but only if tasks are
+ * _not_ frozen (suspend, hibernate) as that would wait forever.
+ *
+ * The wait is required because otherwise the hotplug operation
+ * returns with inconsistent state, which could even be observed in
+ * user space when a new CPU is brought up. The CPU plug uevent
+ * would be delivered and user space reacting on it would fail to
+ * move tasks to the newly plugged CPU up to the point where the
+ * work has finished because up to that point the newly plugged CPU
+ * is not assignable in cpusets/cgroups. On unplug that's not
+ * necessarily a visible issue, but it is still inconsistent state,
+ * which is the real problem which needs to be "fixed". This can't
+ * prevent the transient state between scheduling the work and
+ * returning from waiting for it.
+ */
+ if (!tasks_frozen)
+ cpuset_wait_for_hotplug();
+}
+
#ifdef CONFIG_HOTPLUG_CPU
+#ifndef arch_clear_mm_cpumask_cpu
+#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
+#endif
+
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
@@ -850,7 +1295,7 @@ void clear_tasks_mm_cpumask(int cpu)
t = find_lock_task_mm(p);
if (!t)
continue;
- cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ arch_clear_mm_cpumask_cpu(cpu, t->mm);
task_unlock(t);
}
rcu_read_unlock();
@@ -862,7 +1307,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -870,19 +1314,15 @@ static int take_cpu_down(void *_param)
return err;
/*
- * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
- * do this step again.
+ * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
+ * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
*/
- WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
- st->state--;
- /* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
- /*
- * DYING must not fail!
- */
- WARN_ON_ONCE(ret);
- }
+ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
+
+ /*
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
+ */
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -899,7 +1339,7 @@ static int takedown_cpu(unsigned int cpu)
int err;
/* Park the smpboot threads */
- kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
+ kthread_park(st->thread);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
@@ -915,7 +1355,7 @@ static int takedown_cpu(unsigned int cpu)
/* CPU refused to die */
irq_unlock_sparse();
/* Unpark the hotplug thread so we can rollback there */
- kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
+ kthread_unpark(st->thread);
return err;
}
BUG_ON(cpu_online(cpu));
@@ -937,8 +1377,17 @@ static int takedown_cpu(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
+ cpuhp_bp_sync_dead(cpu);
+
tick_cleanup_dead_cpu(cpu);
+
+ /*
+ * Callbacks must be re-integrated right away to the RCU state machine.
+ * Otherwise an RCU callback could block a further teardown function
+ * waiting for its completion.
+ */
rcutree_migrate_callbacks(cpu);
+
return 0;
}
@@ -954,37 +1403,35 @@ void cpuhp_report_idle_dead(void)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
BUG_ON(st->state != CPUHP_AP_OFFLINE);
- rcu_report_dead(smp_processor_id());
+ rcutree_report_cpu_dead();
st->state = CPUHP_AP_IDLE_DEAD;
/*
- * We cannot call complete after rcu_report_dead() so we delegate it
+ * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
* to an online cpu.
*/
smp_call_function_single(cpumask_first(cpu_online_mask),
cpuhp_complete_idle_dead, st, 0);
}
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++)
- cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
-}
-
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
- if (ret) {
- st->target = prev_state;
- if (st->state < prev_state)
- undo_cpu_down(cpu, st);
- break;
- }
+ ret = cpuhp_invoke_callback_range(false, cpu, st, target);
+ if (ret) {
+ pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
+ cpuhp_reset_state(cpu, st, prev_state);
+
+ if (st->state < prev_state)
+ WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
+ prev_state));
}
+
return ret;
}
@@ -1005,7 +1452,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
@@ -1034,9 +1481,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
- if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- cpuhp_reset_state(st, prev_state);
- __cpuhp_kick_ap(st);
+ if (ret && st->state < prev_state) {
+ if (st->state == CPUHP_TEARDOWN_CPU) {
+ cpuhp_reset_state(cpu, st, prev_state);
+ __cpuhp_kick_ap(st);
+ } else {
+ WARN(1, "DEAD callback error for CPU%d", cpu);
+ }
}
out:
@@ -1047,14 +1498,46 @@ out:
*/
lockup_detector_cleanup();
arch_smt_update();
+ cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
+struct cpu_down_work {
+ unsigned int cpu;
+ enum cpuhp_state target;
+};
+
+static long __cpu_down_maps_locked(void *arg)
+{
+ struct cpu_down_work *work = arg;
+
+ return _cpu_down(work->cpu, 0, work->target);
+}
+
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
+ /*
+ * If the platform does not support hotplug, report it explicitly to
+ * differentiate it from a transient offlining failure.
+ */
+ if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+ return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
- return _cpu_down(cpu, 0, target);
+
+ /*
+ * Ensure that the control task does not run on the to be offlined
+ * CPU to prevent a deadlock against cfs_b->period_timer.
+ * Also keep at least one housekeeping cpu onlined to avoid generating
+ * an empty sched_domain span.
+ */
+ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+ if (cpu != work.cpu)
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+ }
+ return -EBUSY;
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1074,6 +1557,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target)
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use remove_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_down(struct device *dev)
{
@@ -1149,18 +1634,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
- rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- while (st->state < target) {
- st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
- /*
- * STARTING must not fail!
- */
- WARN_ON_ONCE(ret);
- }
+
+ /*
+ * STARTING must not fail!
+ */
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -1176,8 +1657,10 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
/*
- * Unpart the stopper thread before we start the idle loop (and start
+ * Unpark the stopper thread before we start the idle loop (and start
* scheduling); this ensures the stopper task is always available.
*/
stop_machine_unpark(smp_processor_id());
@@ -1214,11 +1697,17 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = PTR_ERR(idle);
goto out;
}
+
+ /*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
}
cpuhp_tasks_frozen = tasks_frozen;
- cpuhp_set_state(st, target);
+ cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1243,6 +1732,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
out:
cpus_write_unlock();
arch_smt_update();
+ cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
@@ -1253,9 +1743,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
-#if defined(CONFIG_IA64)
- pr_err("please check additional_cpus= boot parameter\n");
-#endif
return -EINVAL;
}
@@ -1269,7 +1756,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
- if (!cpu_smt_allowed(cpu)) {
+ if (!cpu_bootable(cpu)) {
err = -EPERM;
goto out;
}
@@ -1287,6 +1774,8 @@ out:
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use add_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_up(struct device *dev)
{
@@ -1312,6 +1801,8 @@ EXPORT_SYMBOL_GPL(add_cpu);
* On some architectures like arm64, we can hibernate on any CPU, but on
* wake up the CPU we hibernated on might be offline as a side effect of
* using maxcpus= for example.
+ *
+ * Return: %0 on success or a negative errno code
*/
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
@@ -1328,18 +1819,106 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
return 0;
}
-void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+ enum cpuhp_state target)
{
unsigned int cpu;
- for_each_present_cpu(cpu) {
- if (num_online_cpus() >= setup_max_cpus)
+ for_each_cpu(cpu, mask) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+ /*
+ * If this failed then cpu_up() might have only
+ * rolled back to CPUHP_BP_KICK_AP for the final
+ * online. Clean it up. NOOP if already rolled back.
+ */
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+ }
+
+ if (!--ncpus)
break;
- if (!cpu_online(cpu))
- cpu_up(cpu, CPUHP_ONLINE);
}
}
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
+static int __init parallel_bringup_parse_param(char *arg)
+{
+ return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+static inline bool cpuhp_smt_aware(void)
+{
+ return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+ const struct cpumask *mask = cpu_present_mask;
+
+ if (__cpuhp_parallel_bringup)
+ __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+ if (!__cpuhp_parallel_bringup)
+ return false;
+
+ if (cpuhp_smt_aware()) {
+ const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+ static struct cpumask tmp_mask __initdata;
+
+ /*
+ * X86 requires to prevent that SMT siblings stopped while
+ * the primary thread does a microcode update for various
+ * reasons. Bring the primary threads up first.
+ */
+ cpumask_and(&tmp_mask, mask, pmask);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+ /* Account for the online CPUs */
+ ncpus -= num_online_cpus();
+ if (!ncpus)
+ return true;
+ /* Create the mask for secondary CPUs */
+ cpumask_andnot(&tmp_mask, mask, pmask);
+ mask = &tmp_mask;
+ }
+
+ /* Bring the not-yet started CPUs up */
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+ return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int setup_max_cpus)
+{
+ /* Try parallel bringup optimization if enabled */
+ if (cpuhp_bringup_cpus_parallel(setup_max_cpus))
+ return;
+
+ /* Full per CPU serialized bringup */
+ cpuhp_bringup_mask(cpu_present_mask, setup_max_cpus, CPUHP_ONLINE);
+}
+
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
@@ -1350,8 +1929,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1521,6 +2100,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1529,7 +2113,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
- .teardown.single = hrtimers_dead_cpu,
+ .teardown.single = NULL,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
@@ -1541,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = relay_prepare_cpu,
.teardown.single = NULL,
},
- [CPUHP_SLAB_PREPARE] = {
- .name = "slab:prepare",
- .startup.single = slab_prepare_cpu,
- .teardown.single = slab_dead_cpu,
- },
[CPUHP_RCUTREE_PREP] = {
.name = "RCU/tree:prepare",
.startup.single = rcutree_prepare_cpu,
@@ -1561,13 +2140,38 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
- /* Kicks the plugged cpu into life */
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+ /*
+ * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
+ * the next step will release it.
+ */
+ [CPUHP_BP_KICK_AP] = {
+ .name = "cpu:kick_ap",
+ .startup.single = cpuhp_kick_ap_alive,
+ },
+
+ /*
+ * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+ * releases it for the complete bringup.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = cpuhp_bringup_ap,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#else
+ /*
+ * All-in-one CPU bringup state which includes the kick alive.
+ */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = finish_cpu,
.cant_stop = true,
},
+#endif
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
@@ -1596,13 +2200,19 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = NULL,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
.name = "ap:online",
},
/*
- * Handled on controll processor until the plugged processor manages
+ * Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +2221,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = takedown_cpu,
.cant_stop = true,
},
+
+ [CPUHP_AP_SCHED_WAIT_EMPTY] = {
+ .name = "sched:waitempty",
+ .startup.single = NULL,
+ .teardown.single = sched_cpu_wait_empty,
+ },
+
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
@@ -1637,6 +2254,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
@@ -1759,8 +2381,7 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
* If there's nothing to do, we done.
* Relies on the union for multi_instance.
*/
- if ((bringup && !sp->startup.single) ||
- (!bringup && !sp->teardown.single))
+ if (cpuhp_step_empty(bringup, sp))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
@@ -1862,6 +2483,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
+ * @name: Name of the step
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
* @startup: startup callback function
@@ -1870,9 +2492,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* added afterwards.
*
* The caller needs to hold cpus read locked while calling this function.
- * Returns:
+ * Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2072,6 +2694,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
+ /*
+ * Disable can be called with CPU_SMT_ENABLED when changing
+ * from a higher to lower number of SMT threads per core.
+ */
+ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
@@ -2106,6 +2734,8 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
+ if (!cpu_smt_thread_allowed(cpu))
+ continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
@@ -2118,18 +2748,17 @@ int cpuhp_smt_enable(void)
#endif
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
-static ssize_t show_cpuhp_state(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->state);
}
-static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+static DEVICE_ATTR_RO(state);
-static ssize_t write_cpuhp_target(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t target_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2160,26 +2789,26 @@ static ssize_t write_cpuhp_target(struct device *dev,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
}
-static ssize_t show_cpuhp_target(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t target_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->target);
}
-static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
-
+static DEVICE_ATTR_RW(target);
-static ssize_t write_cpuhp_fail(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2189,6 +2818,11 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
+ if (fail == CPUHP_INVALID) {
+ st->fail = fail;
+ return count;
+ }
+
if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
return -EINVAL;
@@ -2199,6 +2833,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return -EINVAL;
/*
+ * DEAD callbacks cannot fail...
+ * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
+ * triggering STARTING callbacks, a failure in this state would
+ * hinder rollback.
+ */
+ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
+ return -EINVAL;
+
+ /*
* Cannot fail anything that doesn't have callbacks.
*/
mutex_lock(&cpuhp_state_mutex);
@@ -2214,15 +2857,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return count;
}
-static ssize_t show_cpuhp_fail(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->fail);
}
-static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+static DEVICE_ATTR_RW(fail);
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
@@ -2237,7 +2880,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = {
NULL
};
-static ssize_t show_cpuhp_states(struct device *dev,
+static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
ssize_t cur, res = 0;
@@ -2256,7 +2899,7 @@ static ssize_t show_cpuhp_states(struct device *dev,
mutex_unlock(&cpuhp_state_mutex);
return res;
}
-static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+static DEVICE_ATTR_RO(states);
static struct attribute *cpuhp_cpu_root_attrs[] = {
&dev_attr_states.attr,
@@ -2271,20 +2914,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+ if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+ return threads >= 1 && threads <= cpu_smt_max_threads;
+ return threads == 1 || threads == cpu_smt_max_threads;
+}
+
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- int ctrlval, ret;
-
- if (sysfs_streq(buf, "on"))
- ctrlval = CPU_SMT_ENABLED;
- else if (sysfs_streq(buf, "off"))
- ctrlval = CPU_SMT_DISABLED;
- else if (sysfs_streq(buf, "forceoff"))
- ctrlval = CPU_SMT_FORCE_DISABLED;
- else
- return -EINVAL;
+ int ctrlval, ret, num_threads, orig_threads;
+ bool force_off;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
@@ -2292,21 +2934,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr,
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
+ if (sysfs_streq(buf, "on")) {
+ ctrlval = CPU_SMT_ENABLED;
+ num_threads = cpu_smt_max_threads;
+ } else if (sysfs_streq(buf, "off")) {
+ ctrlval = CPU_SMT_DISABLED;
+ num_threads = 1;
+ } else if (sysfs_streq(buf, "forceoff")) {
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ num_threads = 1;
+ } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+ if (num_threads == 1)
+ ctrlval = CPU_SMT_DISABLED;
+ else if (cpu_smt_num_threads_valid(num_threads))
+ ctrlval = CPU_SMT_ENABLED;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
- if (ctrlval != cpu_smt_control) {
- switch (ctrlval) {
- case CPU_SMT_ENABLED:
- ret = cpuhp_smt_enable();
- break;
- case CPU_SMT_DISABLED:
- case CPU_SMT_FORCE_DISABLED:
- ret = cpuhp_smt_disable(ctrlval);
- break;
- }
- }
+ orig_threads = cpu_smt_num_threads;
+ cpu_smt_num_threads = num_threads;
+
+ force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+ if (num_threads > orig_threads)
+ ret = cpuhp_smt_enable();
+ else if (num_threads < orig_threads || force_off)
+ ret = cpuhp_smt_disable(ctrlval);
unlock_device_hotplug();
return ret ? ret : count;
@@ -2329,28 +2989,38 @@ static const char *smt_states[] = {
[CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
};
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t control_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
const char *state = smt_states[cpu_smt_control];
+#ifdef CONFIG_HOTPLUG_SMT
+ /*
+ * If SMT is enabled but not all threads are enabled then show the
+ * number of threads. If all threads are enabled show "on". Otherwise
+ * show the state name.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED &&
+ cpu_smt_num_threads != cpu_smt_max_threads)
+ return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
-static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t control_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
return __store_smt_control(dev, attr, buf, count);
}
-static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+static DEVICE_ATTR_RW(control);
-static ssize_t
-show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
-static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+static DEVICE_ATTR_RO(active);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
@@ -2366,22 +3036,33 @@ static const struct attribute_group cpuhp_smt_attr_group = {
static int __init cpu_smt_sysfs_init(void)
{
- return sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_smt_attr_group);
+ struct device *dev_root;
+ int ret = -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+ put_device(dev_root);
+ }
+ return ret;
}
static int __init cpuhp_sysfs_init(void)
{
+ struct device *dev_root;
int cpu, ret;
ret = cpu_smt_sysfs_init();
if (ret)
return ret;
- ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_cpu_root_attr_group);
- if (ret)
- return ret;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+ put_device(dev_root);
+ if (ret)
+ return ret;
+ }
for_each_possible_cpu(cpu) {
struct device *dev = get_cpu_device(cpu);
@@ -2442,6 +3123,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);
@@ -2506,8 +3190,10 @@ void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+ atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
/*
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 44a259338e33..b0f0d15085db 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,21 +13,38 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
+
+static int cpu_pm_notify(enum cpu_pm_event event)
+{
+ int ret;
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
{
+ unsigned long flags;
int ret;
- /*
- * __atomic_notifier_call_chain has a RCU read critical section, which
- * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
- * RCU know this.
- */
- rcu_irq_enter_irqson();
- ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
- nr_to_call, nr_calls);
- rcu_irq_exit_irqson();
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
return notifier_to_errno(ret);
}
@@ -39,12 +56,17 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -54,12 +76,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -80,18 +107,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls = 0;
- int ret = 0;
-
- ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU PM
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_pm_enter);
@@ -109,7 +125,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ return cpu_pm_notify(CPU_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -131,18 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls = 0;
- int ret = 0;
-
- ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU cluster
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
@@ -163,7 +168,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 9f1557b98468..75cd6a736d03 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -4,13 +4,28 @@
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*/
-#include <linux/crash_core.h>
+#include <linux/buildid.h>
+#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
#include <asm/page.h>
#include <asm/sections.h>
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -19,6 +34,22 @@ u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
/*
* parsing the "crashkernel" commandline
*
@@ -39,6 +70,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
/* for each entry of the comma-separated list */
do {
@@ -83,13 +123,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
cur = tmp;
- if (size >= system_ram) {
+ if (size >= total_mem) {
pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
/* match ? */
- if (system_ram >= start && system_ram < end) {
+ if (total_mem >= start && total_mem < end) {
*crash_size = size;
break;
}
@@ -158,7 +198,7 @@ static __initdata char *suffix_tbl[] = {
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
+ unsigned long long *crash_size,
const char *suffix)
{
char *cur = cmdline;
@@ -218,9 +258,6 @@ next:
p = strstr(p+1, name);
}
- if (!ck_cmdline)
- return NULL;
-
return ck_cmdline;
}
@@ -228,20 +265,19 @@ static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
- const char *name,
const char *suffix)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
if (!ck_cmdline)
- return -EINVAL;
+ return -ENOENT;
ck_cmdline += strlen(name);
@@ -264,32 +300,337 @@ static int __init __parse_crashkernel(char *cmdline,
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
*/
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ bool *high)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
+ int ret;
+
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
+
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ return ret;
}
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+ return 0;
}
+early_param("crashkernel", parse_crashkernel_dummy);
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+}
+
+static __init int insert_crashkernel_resources(void)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo ELF note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two ELF headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each possible CPU */
+ for_each_possible_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (need_kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long) _text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+#ifdef CONFIG_KEXEC_FILE
+ kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
+ phdr++;
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i;
+ unsigned long long start, end, p_start, p_end;
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
+
+ if (p_start > end)
+ continue;
+
+ /*
+ * Because the memory ranges in mem->ranges are stored in
+ * ascending order, when we detect `p_end < start`, we can
+ * immediately exit the for loop, as the subsequent memory
+ * ranges will definitely be outside the range we are looking
+ * for.
+ */
+ if (p_end < start)
+ break;
+
+ /* Truncate any area outside of range */
+ if (p_start < start)
+ p_start = start;
+ if (p_end > end)
+ p_end = end;
+
+ /* Found completely overlapping range */
+ if (p_start == start && p_end == end) {
+ memmove(&mem->ranges[i], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+ i--;
+ mem->nr_ranges--;
+ } else if (p_start > start && p_end < end) {
+ /* Split original range */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+
+ memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
+ mem->ranges[i].end = p_start - 1;
+ mem->ranges[i + 1].start = p_end + 1;
+ mem->ranges[i + 1].end = end;
+
+ i++;
+ mem->nr_ranges++;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
+ else
+ mem->ranges[i].start = p_end + 1;
+ }
+
+ return 0;
}
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
@@ -361,6 +702,9 @@ void vmcoreinfo_append_str(const char *fmt, ...)
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
}
/*
@@ -394,9 +738,11 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_BUILD_ID();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_OFFSET(uts_namespace, name);
VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
@@ -404,7 +750,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
VMCOREINFO_SYMBOL(mem_map);
VMCOREINFO_SYMBOL(contig_page_data);
#endif
@@ -413,6 +759,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
+ VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
+ VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
#endif
VMCOREINFO_STRUCT_SIZE(page);
VMCOREINFO_STRUCT_SIZE(pglist_data);
@@ -426,12 +774,10 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
VMCOREINFO_OFFSET(pglist_data, node_mem_map);
#endif
VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
@@ -445,7 +791,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
@@ -461,11 +807,24 @@ static int __init crash_save_vmcoreinfo_init(void)
#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+ VMCOREINFO_NUMBER(PG_hugetlb);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
@@ -473,3 +832,223 @@ static int __init crash_save_vmcoreinfo_init(void)
}
subsys_initcall(crash_save_vmcoreinfo_init);
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+ * pages are also on 2 continuous physical pages. In this case the
+ * 2nd part of crash_notes in 2nd page could be lost since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as align value. This can make sure
+ * crash_notes is allocated inside one physical page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break compile if size is bigger than PAGE_SIZE since crash_notes
+ * definitely will be in 2 pages with that.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/*
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+static DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
+/*
+ * This routine utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the crash
+ * elfcorehdr directly.
+ */
+int crash_check_update_elfcorehdr(void)
+{
+ int rc = 0;
+
+ crash_hotplug_lock();
+ /* Obtain lock while reading crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
+ return 0;
+ }
+ if (kexec_crash_image) {
+ if (kexec_crash_image->file_mode)
+ rc = 1;
+ else
+ rc = kexec_crash_image->update_elfcorehdr;
+ }
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
+
+ return rc;
+}
+
+/*
+ * To accurately reflect hot un/plug changes of cpu and memory resources
+ * (including onling and offlining of those resources), the elfcorehdr
+ * (which is passed to the crash kernel via the elfcorehdr= parameter)
+ * must be updated with the new list of CPUs and memories.
+ *
+ * In order to make changes to elfcorehdr, two conditions are needed:
+ * First, the segment containing the elfcorehdr must be large enough
+ * to permit a growing number of resources; the elfcorehdr memory size
+ * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES.
+ * Second, purgatory must explicitly exclude the elfcorehdr from the
+ * list of segments it checks (since the elfcorehdr changes and thus
+ * would require an update to purgatory itself to update the digest).
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+{
+ struct kimage *image;
+
+ crash_hotplug_lock();
+ /* Obtain lock while changing crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
+ return;
+ }
+
+ /* Check kdump is not loaded */
+ if (!kexec_crash_image)
+ goto out;
+
+ image = kexec_crash_image;
+
+ /* Check that updating elfcorehdr is permitted */
+ if (!(image->file_mode || image->update_elfcorehdr))
+ goto out;
+
+ if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+ hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+ pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+ else
+ pr_debug("hp_action %u\n", hp_action);
+
+ /*
+ * The elfcorehdr_index is set to -1 when the struct kimage
+ * is allocated. Find the segment containing the elfcorehdr,
+ * if not already found.
+ */
+ if (image->elfcorehdr_index < 0) {
+ unsigned long mem;
+ unsigned char *ptr;
+ unsigned int n;
+
+ for (n = 0; n < image->nr_segments; n++) {
+ mem = image->segment[n].mem;
+ ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (ptr) {
+ /* The segment containing elfcorehdr */
+ if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+ image->elfcorehdr_index = (int)n;
+ kunmap_local(ptr);
+ }
+ }
+ }
+
+ if (image->elfcorehdr_index < 0) {
+ pr_err("unable to locate elfcorehdr segment");
+ goto out;
+ }
+
+ /* Needed in order for the segments to be updated */
+ arch_kexec_unprotect_crashkres();
+
+ /* Differentiate between normal load and hotplug update */
+ image->hp_action = hp_action;
+
+ /* Now invoke arch-specific update handler */
+ arch_crash_handle_hotplug_event(image);
+
+ /* No longer handling a hotplug event */
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_updated = true;
+
+ /* Change back to read-only */
+ arch_kexec_protect_crashkres();
+
+ /* Errors in the callback is not a reason to rollback state */
+out:
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
+}
+
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+ switch (val) {
+ case MEM_ONLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+
+ case MEM_OFFLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block crash_memhp_nb = {
+ .notifier_call = crash_memhp_notifier,
+ .priority = 0
+};
+
+static int crash_cpuhp_online(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+ return 0;
+}
+
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+ return 0;
+}
+
+static int __init crash_hotplug_init(void)
+{
+ int result = 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ register_memory_notifier(&crash_memhp_nb);
+
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+ "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+ }
+
+ return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..c033a201c808 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -4,6 +4,9 @@
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
+
+#define pr_fmt(fmt) "CRED: " fmt
+
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
@@ -33,17 +36,13 @@ do { \
static struct kmem_cache *cred_jar;
/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
- .subscribers = ATOMIC_INIT(2),
- .magic = CRED_MAGIC,
-#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
@@ -60,33 +59,9 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
+ .ucounts = &init_ucounts,
};
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- return atomic_read(&cred->subscribers);
-#else
- return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- struct cred *cred = (struct cred *) _cred;
-
- atomic_add(n, &cred->subscribers);
-#endif
-}
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -96,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
kdebug("put_cred_rcu(%p)", cred);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
- read_cred_subscribers(cred) != 0)
- panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
- cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
-#endif
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
security_cred_free(cred);
key_put(cred->session_keyring);
@@ -119,6 +83,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ if (cred->ucounts)
+ put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -131,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-
- BUG_ON(atomic_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(cred) != 0);
- cred->magic = CRED_MAGIC_DEAD;
- cred->put_addr = __builtin_return_address(0);
-#endif
+ kdebug("__put_cred(%p{%ld})", cred,
+ atomic_long_read(&cred->usage));
+
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
@@ -156,23 +116,23 @@ EXPORT_SYMBOL(__put_cred);
*/
void exit_creds(struct task_struct *tsk)
{
- struct cred *cred;
+ struct cred *real_cred, *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
+ kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage));
- cred = (struct cred *) tsk->real_cred;
+ real_cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
+
+ if (real_cred == cred) {
+ put_cred_many(cred, 2);
+ } else {
+ put_cred(real_cred);
+ put_cred(cred);
+ }
#ifdef CONFIG_KEYS_REQUEST_CACHE
key_put(tsk->cached_requested_key);
@@ -218,11 +178,7 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
-
+ atomic_long_set(&new->usage, 1);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -253,8 +209,6 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- validate_process_creds();
-
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
@@ -265,8 +219,7 @@ struct cred *prepare_creds(void)
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -282,9 +235,13 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- validate_creds(new);
+
return new;
error:
@@ -345,13 +302,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
- p->real_cred = get_cred(p->cred);
- get_cred(p->cred);
- alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
- read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ p->real_cred = get_cred_many(p->cred, 2);
+ kdebug("share_creds(%p{%ld})",
+ p->cred, atomic_long_read(&p->cred->usage));
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -363,6 +317,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
+ ret = set_cred_ucounts(new);
+ if (ret < 0)
+ goto error_put;
}
#ifdef CONFIG_KEYS
@@ -384,10 +341,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
- alter_cred_subscribers(new, 2);
- validate_creds(new);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
error_put:
@@ -439,17 +394,11 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("commit_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(old) < 2);
- validate_creds(old);
- validate_creds(new);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -484,14 +433,12 @@ int commit_creds(struct cred *new)
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
- alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
- atomic_dec(&old->user->processes);
- alter_cred_subscribers(old, -2);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
@@ -507,8 +454,7 @@ int commit_creds(struct cred *new)
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
- put_cred(old);
- put_cred(old);
+ put_cred_many(old, 2);
return 0;
}
EXPORT_SYMBOL(commit_creds);
@@ -522,14 +468,10 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("abort_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(new) != 0);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
@@ -545,12 +487,8 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
-
- validate_creds(old);
- validate_creds(new);
+ kdebug("override_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
@@ -559,18 +497,12 @@ const struct cred *override_creds(const struct cred *new)
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
- *
- * Also note that we did validate_creds() manually, not depending
- * on the validation in 'get_cred()'.
*/
get_new_cred((struct cred *)new);
- alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
- alter_cred_subscribers(old, -1);
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("override_creds() = %p{%ld}", old,
+ atomic_long_read(&old->usage));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -586,15 +518,10 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("revert_creds(%p{%ld})", old,
+ atomic_long_read(&old->usage));
- validate_creds(old);
- validate_creds(override);
- alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
- alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -653,6 +580,26 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
}
EXPORT_SYMBOL(cred_fscmp);
+int set_cred_ucounts(struct cred *new)
+{
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
+
+ /*
+ * This optimization is needed because alloc_ucounts() uses locks
+ * for table lookups.
+ */
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
+ return 0;
+
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
+ return -EAGAIN;
+
+ new->ucounts = new_ucounts;
+ put_ucounts(old_ucounts);
+
+ return 0;
+}
+
/*
* initialise the credentials stuff
*/
@@ -671,9 +618,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -684,23 +631,20 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
- validate_creds(old);
+ old = get_task_cred(daemon);
*new = *old;
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
@@ -716,11 +660,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
- validate_creds(new);
return new;
error:
@@ -785,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
- if (cred->magic != CRED_MAGIC)
- return true;
- return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
- const struct task_struct *tsk)
-{
- printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
- label, cred,
- cred == &init_cred ? "[init]" : "",
- cred == tsk->real_cred ? "[real]" : "",
- cred == tsk->cred ? "[eff]" : "");
- printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
- cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
- printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- from_kuid_munged(&init_user_ns, cred->uid),
- from_kuid_munged(&init_user_ns, cred->euid),
- from_kuid_munged(&init_user_ns, cred->suid),
- from_kuid_munged(&init_user_ns, cred->fsuid));
- printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- from_kgid_munged(&init_user_ns, cred->gid),
- from_kgid_munged(&init_user_ns, cred->egid),
- from_kgid_munged(&init_user_ns, cred->sgid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
- printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
- if ((unsigned long) cred->security >= PAGE_SIZE &&
- (((unsigned long) cred->security & 0xffffff00) !=
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- printk(KERN_ERR "CRED: ->security {%x, %x}\n",
- ((u32*)cred->security)[0],
- ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
- printk(KERN_ERR "CRED: Invalid credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
- dump_invalid_creds(cred, "Specified", current);
- BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
- const char *file, unsigned line)
-{
- if (tsk->cred == tsk->real_cred) {
- if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- } else {
- if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
- read_cred_subscribers(tsk->cred) < 1 ||
- creds_are_invalid(tsk->real_cred) ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- }
- return;
-
-invalid_creds:
- printk(KERN_ERR "CRED: Invalid process credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
-
- dump_invalid_creds(tsk->real_cred, "Real", tsk);
- if (tsk->cred != tsk->real_cred)
- dump_invalid_creds(tsk->cred, "Effective", tsk);
- else
- printk(KERN_ERR "CRED: Effective creds == Real creds\n");
- BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
- tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
-
- __validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9e5934780f41..ce1bb2301c06 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) "KGDB: " fmt
@@ -53,9 +50,9 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -80,7 +77,7 @@ static int exception_level;
struct kgdb_io *dbg_io_ops;
static DEFINE_SPINLOCK(kgdb_registration_lock);
-/* Action for the reboot notifiter, a global allow kdb to change it */
+/* Action for the reboot notifier, a global allow kdb to change it */
static int kgdbreboot;
/* kgdb console driver is loaded */
static int kgdb_con_registered;
@@ -94,14 +91,6 @@ int dbg_switch_cpu;
/* Use kdb or gdbserver mode */
int dbg_kdb_mode = 1;
-static int __init opt_kgdb_con(char *str)
-{
- kgdb_use_con = 1;
- return 0;
-}
-
-early_param("kgdbcon", opt_kgdb_con);
-
module_param(kgdb_use_con, int, 0644);
module_param(kgdbreboot, int, 0644);
@@ -127,7 +116,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
*/
static atomic_t masters_in_kgdb;
static atomic_t slaves_in_kgdb;
-static atomic_t kgdb_break_tasklet_var;
atomic_t kgdb_setting_breakpoint;
struct task_struct *kgdb_usethread;
@@ -163,7 +151,7 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
/*
* Weak aliases for breakpoint management,
- * can be overriden by architectures when needed:
+ * can be overridden by architectures when needed:
*/
int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
@@ -177,17 +165,23 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
return err;
}
+NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint);
int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
return copy_to_kernel_nofault((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
+NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint);
int __weak kgdb_validate_break_address(unsigned long addr)
{
struct kgdb_bkpt tmp;
int err;
+
+ if (kgdb_within_blocklist(addr))
+ return -EINVAL;
+
/* Validate setting the breakpoint and then removing it. If the
* remove fails, the kernel needs to emit a bad message because we
* are deep trouble not being able to put things back the way we
@@ -208,6 +202,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
{
return instruction_pointer(regs);
}
+NOKPROBE_SYMBOL(kgdb_arch_pc);
int __weak kgdb_arch_init(void)
{
@@ -218,6 +213,7 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
{
return 0;
}
+NOKPROBE_SYMBOL(kgdb_skipexception);
#ifdef CONFIG_SMP
@@ -225,8 +221,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
* Default (weak) implementation for kgdb_roundup_cpus
*/
-static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
-
void __weak kgdb_call_nmi_hook(void *ignored)
{
/*
@@ -239,6 +233,10 @@ void __weak kgdb_call_nmi_hook(void *ignored)
*/
kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
}
+NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
+
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
+ CSD_INIT(kgdb_call_nmi_hook, NULL);
void __weak kgdb_roundup_cpus(void)
{
@@ -266,12 +264,12 @@ void __weak kgdb_roundup_cpus(void)
continue;
kgdb_info[cpu].rounding_up = true;
- csd->func = kgdb_call_nmi_hook;
ret = smp_call_function_single_async(cpu, csd);
if (ret)
kgdb_info[cpu].rounding_up = false;
}
}
+NOKPROBE_SYMBOL(kgdb_roundup_cpus);
#endif
@@ -284,20 +282,10 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
+NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr);
/*
* SW breakpoint management:
@@ -325,6 +313,7 @@ int dbg_activate_sw_breakpoints(void)
}
return ret;
}
+NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints);
int dbg_set_sw_break(unsigned long addr)
{
@@ -388,6 +377,7 @@ int dbg_deactivate_sw_breakpoints(void)
}
return ret;
}
+NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints);
int dbg_remove_sw_break(unsigned long addr)
{
@@ -451,6 +441,17 @@ setundefined:
return 0;
}
+void kgdb_free_init_mem(void)
+{
+ int i;
+
+ /* Clear init memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+}
+
#ifdef CONFIG_KGDB_KDB
void kdb_dump_stack_on_cpu(int cpu)
{
@@ -509,6 +510,7 @@ static int kgdb_io_ready(int print_wait)
}
return 1;
}
+NOKPROBE_SYMBOL(kgdb_io_ready);
static int kgdb_reenter_check(struct kgdb_state *ks)
{
@@ -556,6 +558,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
return 1;
}
+NOKPROBE_SYMBOL(kgdb_reenter_check);
static void dbg_touch_watchdogs(void)
{
@@ -563,6 +566,7 @@ static void dbg_touch_watchdogs(void)
clocksource_touch_watchdog();
rcu_cpu_stall_reset();
}
+NOKPROBE_SYMBOL(dbg_touch_watchdogs);
static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int exception_state)
@@ -737,6 +741,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
@@ -752,6 +779,8 @@ cpu_master_loop:
}
}
+ dbg_activate_sw_breakpoints();
+
/* Call the I/O driver's post_exception routine */
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
@@ -794,6 +823,7 @@ kgdb_restore:
return kgdb_info[cpu].ret_state;
}
+NOKPROBE_SYMBOL(kgdb_cpu_enter);
/*
* kgdb_handle_exception() - main entry point from a kernel exception
@@ -838,6 +868,7 @@ out:
arch_kgdb_ops.enable_nmi(1);
return ret;
}
+NOKPROBE_SYMBOL(kgdb_handle_exception);
/*
* GDB places a breakpoint at this function to know dynamically loaded objects.
@@ -872,6 +903,7 @@ int kgdb_nmicallback(int cpu, void *regs)
#endif
return 1;
}
+NOKPROBE_SYMBOL(kgdb_nmicallback);
int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
atomic_t *send_ready)
@@ -897,6 +929,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
#endif
return 1;
}
+NOKPROBE_SYMBOL(kgdb_nmicallin);
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
@@ -920,8 +953,22 @@ static struct console kgdbcons = {
.index = -1,
};
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+
+ if (kgdb_io_module_registered && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_dbg(int key)
+static void sysrq_handle_dbg(u8 key)
{
if (!dbg_io_ops) {
pr_crit("ERROR: No KGDB I/O module available\n");
@@ -959,6 +1006,9 @@ void kgdb_panic(const char *msg)
if (panic_timeout)
return;
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", msg);
@@ -994,12 +1044,13 @@ dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
/*
* Take the following action on reboot notify depending on value:
* 1 == Enter debugger
- * 0 == [the default] detatch debug client
+ * 0 == [the default] detach debug client
* -1 == Do nothing... and use this until the board resets
*/
switch (kgdbreboot) {
case 1:
kgdb_breakpoint();
+ goto done;
case -1:
goto done;
}
@@ -1056,31 +1107,6 @@ static void kgdb_unregister_callbacks(void)
}
}
-/*
- * There are times a tasklet needs to be used vs a compiled in
- * break point so as to cause an exception outside a kgdb I/O module,
- * such as is the case with kgdboe, where calling a breakpoint in the
- * I/O driver itself would be fatal.
- */
-static void kgdb_tasklet_bpt(unsigned long ing)
-{
- kgdb_breakpoint();
- atomic_set(&kgdb_break_tasklet_var, 0);
-}
-
-static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
-
-void kgdb_schedule_breakpoint(void)
-{
- if (atomic_read(&kgdb_break_tasklet_var) ||
- atomic_read(&kgdb_active) != -1 ||
- atomic_read(&kgdb_setting_breakpoint))
- return;
- atomic_inc(&kgdb_break_tasklet_var);
- tasklet_schedule(&kgdb_tasklet_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -1138,7 +1164,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
EXPORT_SYMBOL_GPL(kgdb_register_io_module);
/**
- * kkgdb_unregister_io_module - unregister KGDB IO module
+ * kgdb_unregister_io_module - unregister KGDB IO module
* @old_dbg_io_ops: the io ops vector
*
* Unregister it with the KGDB core.
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a790026e42d0..9d34d2364b5a 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#include <linux/kernel.h>
@@ -321,7 +318,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
/*
* Copy the binary array pointed to by buf into mem. Fix $, #, and
* 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * The input buf is overwitten with the result to write to mem.
+ * The input buf is overwritten with the result to write to mem.
*/
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
@@ -595,7 +592,7 @@ static char *gdb_hex_reg_helper(int regnum, char *out)
dbg_reg_def[i].size);
}
-/* Handle the 'p' individual regster get */
+/* Handle the 'p' individual register get */
static void gdb_cmd_reg_get(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -610,7 +607,7 @@ static void gdb_cmd_reg_get(struct kgdb_state *ks)
gdb_hex_reg_helper(regnum, remcom_out_buffer);
}
-/* Handle the 'P' individual regster set */
+/* Handle the 'P' individual register set */
static void gdb_cmd_reg_set(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -725,7 +722,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
}
}
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (i >= ks->thr_query && !finished) {
int_to_threadref(thref, p->pid);
ptr = pack_threadid(ptr, thref);
@@ -735,7 +732,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
finished = 1;
}
i++;
- } while_each_thread(g, p);
+ }
*(--ptr) = '\0';
break;
@@ -952,7 +949,7 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
}
/*
- * This function performs all gdbserial command procesing
+ * This function performs all gdbserial command processing
*/
int gdb_serial_stub(struct kgdb_state *ks)
{
@@ -1045,15 +1042,15 @@ int gdb_serial_stub(struct kgdb_state *ks)
gdb_cmd_detachkill(ks);
return DBG_PASS_EVENT;
}
+ fallthrough;
#endif
- /* Fall through */
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
goto default_handle;
if (tmp == 0)
break;
- /* Fall through - on tmp < 0 */
+ fallthrough; /* on tmp < 0 */
case 'c': /* Continue packet */
case 's': /* Single step packet */
if (kgdb_contthread && kgdb_contthread != current) {
@@ -1061,8 +1058,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
error_packet(remcom_out_buffer, -EINVAL);
break;
}
- dbg_activate_sw_breakpoints();
- /* Fall through - to default processing */
+ fallthrough; /* to default processing */
default:
default_handle:
error = kgdb_arch_handle_exception(ks->ex_vector,
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index d7ebb2c79cb8..372025cf1ca3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -307,6 +307,15 @@ static int kdb_bp(int argc, const char **argv)
return KDB_BADINT;
/*
+ * This check is redundant (since the breakpoint machinery should
+ * be doing the same check during kdb_bp_install) but gives the
+ * user immediate feedback.
+ */
+ diag = kgdb_validate_break_address(template.bp_addr);
+ if (diag)
+ return diag;
+
+ /*
* Find an empty bp structure to allocate
*/
for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
@@ -513,6 +522,54 @@ static int kdb_ss(int argc, const char **argv)
return KDB_CMD_SS;
}
+static kdbtab_t bptab[] = {
+ { .name = "bp",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Set/Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "bl",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "bc",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Clear Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "be",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Enable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "bd",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Disable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "ss",
+ .func = kdb_ss,
+ .usage = "",
+ .help = "Single Step",
+ .minlen = 1,
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+};
+
+static kdbtab_t bphcmd = {
+ .name = "bph",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "[datar [length]|dataw [length]] Set hw brk",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+};
+
/* Initialize the breakpoint table and register breakpoint commands. */
void __init kdb_initbptab(void)
@@ -528,30 +585,7 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_table(bptab, ARRAY_SIZE(bptab));
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
-
- kdb_register_flags("ss", kdb_ss, "",
- "Single Step", 1,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- /*
- * Architecture dependent initialization.
- */
+ kdb_register_table(&bphcmd, 1);
}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 18e03aba2cfc..10b454554ab0 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -46,7 +46,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
- * bta [DRSTCZEUIMA] All useful processes, optionally
+ * bta [state_chars>|A] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
@@ -74,7 +74,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
+kdb_bt1(struct task_struct *p, const char *mask, bool btaprompt)
{
char ch;
@@ -120,7 +120,7 @@ kdb_bt_cpu(unsigned long cpu)
return;
}
- kdb_bt1(kdb_tsk, ~0UL, false);
+ kdb_bt1(kdb_tsk, "A", false);
}
int
@@ -138,8 +138,8 @@ kdb_bt(int argc, const char **argv)
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
- unsigned long mask = kdb_task_state_string(argc ? argv[1] :
- NULL);
+ const char *mask = argc ? argv[1] : kdbgetenv("PS");
+
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
@@ -149,14 +149,14 @@ kdb_bt(int argc, const char **argv)
return 0;
}
/* Now the inactive tasks */
- kdb_do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, btaprompt))
return 0;
- } kdb_while_each_thread(g, p);
+ }
} else if (strcmp(argv[0], "btp") == 0) {
struct task_struct *p;
unsigned long pid;
@@ -167,7 +167,7 @@ kdb_bt(int argc, const char **argv)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- return kdb_bt1(p, ~0UL, false);
+ return kdb_bt1(p, "A", false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -176,7 +176,7 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- return kdb_bt1((struct task_struct *)addr, ~0UL, false);
+ return kdb_bt1((struct task_struct *)addr, "A", false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
if (argc > 1)
@@ -212,7 +212,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, false);
+ return kdb_bt1(kdb_current_task, "A", false);
}
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 53a0df6e4d92..e91fc3e4edd5 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -140,14 +140,12 @@ int kdb_stub(struct kgdb_state *ks)
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
- kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
return DBG_PASS_EVENT;
}
kdb_bp_install(ks->linux_regs);
- dbg_activate_sw_breakpoints();
/* Set the exit state to a single step or a continue */
if (KDB_STATE(DOING_SS))
gdbstub_state(ks, "s");
@@ -167,7 +165,6 @@ int kdb_stub(struct kgdb_state *ks)
* differently vs the gdbstub
*/
kgdb_single_step = 0;
- dbg_deactivate_sw_breakpoints();
return DBG_SWITCH_CPU_EVENT;
}
return kgdb_info[ks->cpu].ret_state;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 683a799618ad..9443bc63c5a2 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
@@ -132,6 +131,7 @@ char kdb_getchar(void)
int escape_delay = 0;
get_char_func *f, *f_prev = NULL;
int key;
+ static bool last_char_was_cr;
for (f = &kdb_poll_funcs[0]; ; ++f) {
if (*f == NULL) {
@@ -151,6 +151,18 @@ char kdb_getchar(void)
}
/*
+ * The caller expects that newlines are either CR or LF. However
+ * some terminals send _both_ CR and LF. Avoid having to handle
+ * this in the caller by stripping the LF if we saw a CR right
+ * before.
+ */
+ if (last_char_was_cr && key == '\n') {
+ last_char_was_cr = false;
+ continue;
+ }
+ last_char_was_cr = (key == '\r');
+
+ /*
* When the first character is received (or we get a change
* input source) we set ourselves up to handle an escape
* sequences (just in case).
@@ -245,7 +257,8 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* linefeed */
+ case 13: /* carriage return */
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
@@ -545,25 +558,40 @@ static int kdb_search_string(char *searched, char *searchfor)
static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
+ const char *cp;
+ int cookie;
+ int len;
if (msg_len == 0)
return;
- if (dbg_io_ops) {
- const char *cp = msg;
- int len = msg_len;
+ cp = msg;
+ len = msg_len;
- while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
- }
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!(console_srcu_read_flags(c) & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
+ if (!c->write)
+ continue;
/*
* Set oops_in_progress to encourage the console drivers to
* disregard their internal spin locks: in the current calling
@@ -578,6 +606,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
--oops_in_progress;
touch_nmi_watchdog();
}
+ console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
@@ -591,7 +620,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
@@ -706,12 +735,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
- if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) {
/*
* This was a interactive search (using '/' at more
- * prompt) and it has completed. Clear the flag.
+ * prompt) and it has completed. Replace the \0 with
+ * its original value to ensure multi-line strings
+ * are handled properly, and return to normal mode.
*/
+ *cphold = replaced_byte;
kdb_grepping_flag = 0;
+ }
/*
* at this point the string is a full line and
* should be printed, up to the null.
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 750497b0003a..3c2987f46f6e 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,9 +11,10 @@
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/io.h>
+#include "kdb_private.h"
+
/* Keyboard Controller Registers on normal PCs. */
#define KBD_STATUS_REG 0x64 /* Status register (R) */
@@ -173,11 +174,11 @@ int kdb_get_kbd_char(void)
case KT_LATIN:
if (isprint(keychar))
break; /* printable characters */
- /* fall through */
+ fallthrough;
case KT_SPEC:
if (keychar == K_ENTER)
break;
- /* fall through */
+ fallthrough;
default:
return -1; /* ignore unprintables */
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 5c7949061671..d05066cb40b2 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -26,7 +26,6 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -45,6 +44,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -84,15 +84,8 @@ static unsigned int kdb_continue_catastrophic =
static unsigned int kdb_continue_catastrophic;
#endif
-/* kdb_commands describes the available commands. */
-static kdbtab_t *kdb_commands;
-#define KDB_BASE_CMD_MAX 50
-static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
-#define for_each_kdbcmd(cmd, num) \
- for ((cmd) = kdb_base_commands, (num) = 0; \
- num < kdb_max_commands; \
- num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+/* kdb_cmds_head describes the available commands. */
+static LIST_HEAD(kdb_cmds_head);
typedef struct _kdbmsg {
int km_diag; /* kdb diagnostic */
@@ -146,42 +139,18 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
* KDB_ENVBUFSIZE if required).
*/
-static char *__env[] = {
+static char *__env[31] = {
#if defined(CONFIG_SMP)
- "PROMPT=[%d]kdb> ",
+ "PROMPT=[%d]kdb> ",
#else
- "PROMPT=kdb> ",
+ "PROMPT=kdb> ",
#endif
- "MOREPROMPT=more> ",
- "RADIX=16",
- "MDCOUNT=8", /* lines of md output */
- KDB_PLATFORM_ENV,
- "DTABCOUNT=30",
- "NOSECT=1",
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
};
static const int __nenv = ARRAY_SIZE(__env);
@@ -197,10 +166,62 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
+{
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
+}
+
+/*
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -251,11 +272,10 @@ char *kdbgetenv(const char *match)
* kdballocenv - This function is used to allocate bytes for
* environment entries.
* Parameters:
- * match A character string representing a numeric value
- * Outputs:
- * *value the unsigned long representation of the env variable 'match'
+ * bytes The number of bytes to allocate in the static buffer.
* Returns:
- * Zero on success, a kdb diagnostic on failure.
+ * A pointer to the allocated space in the buffer on success.
+ * NULL if bytes > size available in the envbuffer.
* Remarks:
* We use a static environment buffer (envbuffer) to hold the values
* of dynamically generated environment variables (see kdb_set). Buffer
@@ -283,7 +303,7 @@ static char *kdballocenv(size_t bytes)
* Parameters:
* match A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of the env variable 'match'
+ * *value the unsigned long representation of the env variable 'match'
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
@@ -324,12 +344,69 @@ int kdbgetintenv(const char *match, int *value)
}
/*
+ * kdb_setenv() - Alter an existing environment variable or create a new one.
+ * @var: Name of the variable
+ * @val: Value of the variable
+ *
+ * Return: Zero on success, a kdb diagnostic on failure.
+ */
+static int kdb_setenv(const char *var, const char *val)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ varlen = strlen(var);
+ vallen = strlen(val);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", var, val);
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], var, varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+/*
+ * kdb_printenv() - Display the current environment variables.
+ */
+static void kdb_printenv(void)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+}
+
+/*
* kdbgetularg - This function will convert a numeric string into an
* unsigned long value.
* Parameters:
* arg A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of arg.
+ * *value the unsigned long representation of arg.
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
@@ -380,10 +457,6 @@ int kdbgetu64arg(const char *arg, u64 *value)
*/
int kdb_set(int argc, const char **argv)
{
- int i;
- char *ep;
- size_t varlen, vallen;
-
/*
* we can be invoked two ways:
* set var=value argv[1]="var", argv[2]="value"
@@ -428,37 +501,7 @@ int kdb_set(int argc, const char **argv)
* Tokenizer squashed the '=' sign. argv[1] is variable
* name, argv[2] = value.
*/
- varlen = strlen(argv[1]);
- vallen = strlen(argv[2]);
- ep = kdballocenv(varlen + vallen + 2);
- if (ep == (char *)0)
- return KDB_ENVBUFFULL;
-
- sprintf(ep, "%s=%s", argv[1], argv[2]);
-
- ep[varlen+vallen+1] = '\0';
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i]
- && ((strncmp(__env[i], argv[1], varlen) == 0)
- && ((__env[i][varlen] == '\0')
- || (__env[i][varlen] == '=')))) {
- __env[i] = ep;
- return 0;
- }
- }
-
- /*
- * Wasn't existing variable. Fit into slot.
- */
- for (i = 0; i < __nenv-1; i++) {
- if (__env[i] == (char *)0) {
- __env[i] = ep;
- return 0;
- }
- }
-
- return KDB_ENVFULL;
+ return kdb_setenv(argv[1], argv[2]);
}
static int kdb_check_regs(void)
@@ -477,7 +520,7 @@ static int kdb_check_regs(void)
* symbol name, and offset to the caller.
*
* The argument may consist of a numeric value (decimal or
- * hexidecimal), a symbol name, a register name (preceded by the
+ * hexadecimal), a symbol name, a register name (preceded by the
* percent sign), an environment variable with a numeric value
* (preceded by a dollar sign) or a simple arithmetic expression
* consisting of a symbol name, +/-, and a numeric constant value
@@ -661,16 +704,17 @@ static void kdb_cmderror(int diag)
* Returns:
* zero for success, a kdb diagnostic if error
*/
-struct defcmd_set {
- int count;
- bool usable;
- char *name;
- char *usage;
- char *help;
- char **command;
+struct kdb_macro {
+ kdbtab_t cmd; /* Macro command */
+ struct list_head statements; /* Associated statement list */
};
-static struct defcmd_set *defcmd_set;
-static int defcmd_set_count;
+
+struct kdb_macro_statement {
+ char *statement; /* Statement text */
+ struct list_head list_node; /* Statement list node */
+};
+
+static struct kdb_macro *kdb_macro;
static bool defcmd_in_progress;
/* Forward references */
@@ -678,53 +722,55 @@ static int kdb_exec_defcmd(int argc, const char **argv);
static int kdb_defcmd2(const char *cmdstr, const char *argv0)
{
- struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
- char **save_command = s->command;
+ struct kdb_macro_statement *kms;
+
+ if (!kdb_macro)
+ return KDB_NOTIMP;
+
if (strcmp(argv0, "endefcmd") == 0) {
defcmd_in_progress = false;
- if (!s->count)
- s->usable = false;
- if (s->usable)
- /* macros are always safe because when executed each
- * internal command re-enters kdb_parse() and is
- * safety checked individually.
- */
- kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
- s->help, 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ if (!list_empty(&kdb_macro->statements))
+ kdb_register(&kdb_macro->cmd);
return 0;
}
- if (!s->usable)
- return KDB_NOTIMP;
- s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB);
- if (!s->command) {
- kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+
+ kms = kmalloc(sizeof(*kms), GFP_KDB);
+ if (!kms) {
+ kdb_printf("Could not allocate new kdb macro command: %s\n",
cmdstr);
- s->usable = false;
return KDB_NOTIMP;
}
- memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
- s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
- kfree(save_command);
+
+ kms->statement = kdb_strdup(cmdstr, GFP_KDB);
+ list_add_tail(&kms->list_node, &kdb_macro->statements);
+
return 0;
}
static int kdb_defcmd(int argc, const char **argv)
{
- struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ kdbtab_t *mp;
+
if (defcmd_in_progress) {
kdb_printf("kdb: nested defcmd detected, assuming missing "
"endefcmd\n");
kdb_defcmd2("endefcmd", "endefcmd");
}
if (argc == 0) {
- int i;
- for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
- kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
- s->usage, s->help);
- for (i = 0; i < s->count; ++i)
- kdb_printf("%s", s->command[i]);
- kdb_printf("endefcmd\n");
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (kp->func == kdb_exec_defcmd) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n",
+ kp->name, kp->usage, kp->help);
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements,
+ list_node)
+ kdb_printf("%s", kms->statement);
+ kdb_printf("endefcmd\n");
+ }
}
return 0;
}
@@ -734,45 +780,43 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set)
+ kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB);
+ if (!kdb_macro)
goto fail_defcmd;
- memcpy(defcmd_set, save_defcmd_set,
- defcmd_set_count * sizeof(*defcmd_set));
- s = defcmd_set + defcmd_set_count;
- memset(s, 0, sizeof(*s));
- s->usable = true;
- s->name = kdb_strdup(argv[1], GFP_KDB);
- if (!s->name)
+
+ mp = &kdb_macro->cmd;
+ mp->func = kdb_exec_defcmd;
+ mp->minlen = 0;
+ mp->flags = KDB_ENABLE_ALWAYS_SAFE;
+ mp->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!mp->name)
goto fail_name;
- s->usage = kdb_strdup(argv[2], GFP_KDB);
- if (!s->usage)
+ mp->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!mp->usage)
goto fail_usage;
- s->help = kdb_strdup(argv[3], GFP_KDB);
- if (!s->help)
+ mp->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!mp->help)
goto fail_help;
- if (s->usage[0] == '"') {
- strcpy(s->usage, argv[2]+1);
- s->usage[strlen(s->usage)-1] = '\0';
+ if (mp->usage[0] == '"') {
+ strcpy(mp->usage, argv[2]+1);
+ mp->usage[strlen(mp->usage)-1] = '\0';
}
- if (s->help[0] == '"') {
- strcpy(s->help, argv[3]+1);
- s->help[strlen(s->help)-1] = '\0';
+ if (mp->help[0] == '"') {
+ strcpy(mp->help, argv[3]+1);
+ mp->help[strlen(mp->help)-1] = '\0';
}
- ++defcmd_set_count;
+
+ INIT_LIST_HEAD(&kdb_macro->statements);
defcmd_in_progress = true;
- kfree(save_defcmd_set);
return 0;
fail_help:
- kfree(s->usage);
+ kfree(mp->usage);
fail_usage:
- kfree(s->name);
+ kfree(mp->name);
fail_name:
- kfree(defcmd_set);
+ kfree(kdb_macro);
fail_defcmd:
- kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
- defcmd_set = save_defcmd_set;
+ kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]);
return KDB_NOTIMP;
}
@@ -787,25 +831,31 @@ fail_defcmd:
*/
static int kdb_exec_defcmd(int argc, const char **argv)
{
- int i, ret;
- struct defcmd_set *s;
+ int ret;
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
if (argc != 0)
return KDB_ARGCOUNT;
- for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
- if (strcmp(s->name, argv[0]) == 0)
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, argv[0]) == 0)
break;
}
- if (i == defcmd_set_count) {
+ if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) {
kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
argv[0]);
return KDB_NOTIMP;
}
- for (i = 0; i < s->count; ++i) {
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements, list_node) {
+ /*
+ * Recursive use of kdb_parse, do not use argv after this point.
+ */
argv = NULL;
- kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
- ret = kdb_parse(s->command[i]);
+ kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement);
+ ret = kdb_parse(kms->statement);
if (ret)
return ret;
}
@@ -901,7 +951,7 @@ static void parse_grep(const char *str)
* Limited to 20 tokens.
*
* Real rudimentary tokenization. Basically only whitespace
- * is considered a token delimeter (but special consideration
+ * is considered a token delimiter (but special consideration
* is taken of the '=' sign as used by the 'set' command).
*
* The algorithm used to tokenize the input string relies on
@@ -921,7 +971,7 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep = 0;
+ int escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
@@ -1011,25 +1061,17 @@ int kdb_parse(const char *cmdstr)
++argv[0];
}
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- /*
- * If this command is allowed to be abbreviated,
- * check to see if this is it.
- */
-
- if (tp->cmd_minlen
- && (strlen(argv[0]) <= tp->cmd_minlen)) {
- if (strncmp(argv[0],
- tp->cmd_name,
- tp->cmd_minlen) == 0) {
- break;
- }
- }
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+ if (tp->minlen && (strlen(argv[0]) <= tp->minlen) &&
+ (strncmp(argv[0], tp->name, tp->minlen) == 0))
+ break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
- break;
- }
+ if (strcmp(argv[0], tp->name) == 0)
+ break;
}
/*
@@ -1037,34 +1079,29 @@ int kdb_parse(const char *cmdstr)
* few characters of this match any of the known commands.
* e.g., md1c20 should match md.
*/
- if (i == kdb_max_commands) {
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- if (strncmp(argv[0],
- tp->cmd_name,
- strlen(tp->cmd_name)) == 0) {
- break;
- }
- }
+ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0)
+ break;
}
}
- if (i < kdb_max_commands) {
+ if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
- if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1))
return KDB_NOPERM;
KDB_STATE_SET(CMD);
- result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ result = (*tp->func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ if (tp->flags & KDB_REPEAT_WITH_ARGS)
return result;
- argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
if (argv[argc])
*(argv[argc]) = '\0';
return result;
@@ -1194,6 +1231,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
@@ -1308,8 +1348,6 @@ do_full_getstr:
/* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
@@ -2018,54 +2056,6 @@ static int kdb_ef(int argc, const char **argv)
return 0;
}
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command. Lists
- * currently loaded kernel modules.
- * Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
- struct module *mod;
-
- if (argc != 0)
- return KDB_ARGCOUNT;
-
- kdb_printf("Module Size modstruct Used by\n");
- list_for_each_entry(mod, kdb_modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- kdb_printf("%-20s%8u 0x%px ", mod->name,
- mod->core_layout.size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
-#endif
- if (mod->state == MODULE_STATE_GOING)
- kdb_printf(" (Unloading)");
- else if (mod->state == MODULE_STATE_COMING)
- kdb_printf(" (Loading)");
- else
- kdb_printf(" (Live)");
- kdb_printf(" 0x%px", mod->core_layout.base);
-
-#ifdef CONFIG_MODULE_UNLOAD
- {
- struct module_use *use;
- kdb_printf(" [ ");
- list_for_each_entry(use, &mod->source_list,
- source_list)
- kdb_printf("%s ", use->target->name);
- kdb_printf("]\n");
- }
-#endif
- }
-
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
/*
* kdb_env - This function implements the 'env' command. Display the
* current environment variables.
@@ -2073,12 +2063,7 @@ static int kdb_lsmod(int argc, const char **argv)
static int kdb_env(int argc, const char **argv)
{
- int i;
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i])
- kdb_printf("%s\n", __env[i]);
- }
+ kdb_printenv();
if (KDB_DEBUG(MASK))
kdb_printf("KDBDEBUG=0x%x\n",
@@ -2101,7 +2086,7 @@ static int kdb_dmesg(int argc, const char **argv)
int adjust = 0;
int n = 0;
int skip = 0;
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dump_iter iter;
size_t len;
char buf[201];
@@ -2126,8 +2111,8 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2159,8 +2144,8 @@ static int kdb_dmesg(int argc, const char **argv)
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
@@ -2222,8 +2207,8 @@ static void kdb_cpu_status(void)
state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
- if (kdb_task_state_char(KDB_TSK(i)) == 'I')
- state = 'I'; /* idle task */
+ if (kdb_task_state_char(KDB_TSK(i)) == '-')
+ state = '-'; /* idle task */
}
if (state != prev_state) {
if (prev_state != '?') {
@@ -2290,37 +2275,30 @@ static int kdb_cpu(int argc, const char **argv)
void kdb_ps_suppressed(void)
{
int idle = 0, daemon = 0;
- unsigned long mask_I = kdb_task_state_string("I"),
- mask_M = kdb_task_state_string("M");
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_task_state(p, mask_I))
+ if (kdb_task_state(p, "-"))
++idle;
}
- kdb_do_each_thread(g, p) {
- if (kdb_task_state(p, mask_M))
+ for_each_process_thread(g, p) {
+ if (kdb_task_state(p, "ims"))
++daemon;
- } kdb_while_each_thread(g, p);
+ }
if (idle || daemon) {
if (idle)
- kdb_printf("%d idle process%s (state I)%s\n",
+ kdb_printf("%d idle process%s (state -)%s\n",
idle, idle == 1 ? "" : "es",
daemon ? " and " : "");
if (daemon)
- kdb_printf("%d sleeping system daemon (state M) "
+ kdb_printf("%d sleeping system daemon (state [ims]) "
"process%s", daemon,
daemon == 1 ? "" : "es");
kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
}
}
-/*
- * kdb_ps - This function implements the 'ps' command which shows a
- * list of the active processes.
- * ps [DRSTCZEUIMA] All processes, optionally filtered by state
- */
void kdb_ps1(const struct task_struct *p)
{
int cpu;
@@ -2349,17 +2327,25 @@ void kdb_ps1(const struct task_struct *p)
}
}
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ *
+ * ps [<state_chars>] Show processes, optionally selecting only those whose
+ * state character is found in <state_chars>.
+ */
static int kdb_ps(int argc, const char **argv)
{
struct task_struct *g, *p;
- unsigned long mask, cpu;
+ const char *mask;
+ unsigned long cpu;
if (argc == 0)
kdb_ps_suppressed();
kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
(int)(2*sizeof(void *))+2, "Task Addr",
(int)(2*sizeof(void *))+2, "Thread");
- mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ mask = argc ? argv[1] : kdbgetenv("PS");
/* Run the active tasks first */
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
@@ -2370,12 +2356,12 @@ static int kdb_ps(int argc, const char **argv)
}
kdb_printf("\n");
/* Now the real tasks */
- kdb_do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
if (kdb_task_state(p, mask))
kdb_ps1(p);
- } kdb_while_each_thread(g, p);
+ }
return 0;
}
@@ -2428,23 +2414,20 @@ static int kdb_kgdb(int argc, const char **argv)
static int kdb_help(int argc, const char **argv)
{
kdbtab_t *kt;
- int i;
kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
kdb_printf("-----------------------------"
"-----------------------------\n");
- for_each_kdbcmd(kt, i) {
+ list_for_each_entry(kt, &kdb_cmds_head, list_node) {
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kt->cmd_name)
+ if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true))
continue;
- if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
- continue;
- if (strlen(kt->cmd_usage) > 20)
+ if (strlen(kt->usage) > 20)
space = "\n ";
- kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
- kt->cmd_usage, space, kt->cmd_help);
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->name,
+ kt->usage, space, kt->help);
}
return 0;
}
@@ -2515,7 +2498,6 @@ static void kdb_sysinfo(struct sysinfo *val)
static int kdb_summary(int argc, const char **argv)
{
time64_t now;
- struct tm tm;
struct sysinfo val;
if (argc)
@@ -2529,13 +2511,7 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
now = __ktime_get_real_seconds();
- time64_to_tm(now, 0, &tm);
- kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d "
- "tz_minuteswest %d\n",
- 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
- tm.tm_hour, tm.tm_min, tm.tm_sec,
- sys_tz.tz_minuteswest);
-
+ kdb_printf("date %ptTs tz_minuteswest %d\n", &now, sys_tz.tz_minuteswest);
kdb_sysinfo(&val);
kdb_printf("uptime ");
if (val.uptime > (24*60*60)) {
@@ -2647,246 +2623,278 @@ static int kdb_grep_help(int argc, const char **argv)
return 0;
}
-/*
- * kdb_register_flags - This function is used to register a kernel
- * debugger command.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * repeat Does the command auto repeat on enter?
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register() - This function is used to register a kernel debugger
+ * command.
+ * @cmd: pointer to kdb command
+ *
+ * Note that it's the job of the caller to keep the memory for the cmd
+ * allocated until unregister is called.
*/
-#define kdb_command_extend 50 /* arbitrary */
-int kdb_register_flags(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_cmdflags_t flags)
+int kdb_register(kdbtab_t *cmd)
{
- int i;
kdbtab_t *kp;
- /*
- * Brute force method to determine duplicates
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kdb_printf("Duplicate kdb command registered: "
- "%s, func %px help %s\n", cmd, func, help);
- return 1;
- }
- }
-
- /*
- * Insert command into first available location in table
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name == NULL)
- break;
- }
-
- if (i >= kdb_max_commands) {
- kdbtab_t *new = kmalloc_array(kdb_max_commands -
- KDB_BASE_CMD_MAX +
- kdb_command_extend,
- sizeof(*new),
- GFP_KDB);
- if (!new) {
- kdb_printf("Could not allocate new kdb_command "
- "table\n");
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, cmd->name) == 0) {
+ kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n",
+ cmd->name, cmd->func, cmd->help);
return 1;
}
- if (kdb_commands) {
- memcpy(new, kdb_commands,
- (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
- kfree(kdb_commands);
- }
- memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
- kdb_command_extend * sizeof(*new));
- kdb_commands = new;
- kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
- kdb_max_commands += kdb_command_extend;
}
- kp->cmd_name = cmd;
- kp->cmd_func = func;
- kp->cmd_usage = usage;
- kp->cmd_help = help;
- kp->cmd_minlen = minlen;
- kp->cmd_flags = flags;
-
+ list_add_tail(&cmd->list_node, &kdb_cmds_head);
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_flags);
-
+EXPORT_SYMBOL_GPL(kdb_register);
-/*
- * kdb_register - Compatibility register function for commands that do
- * not need to specify a repeat state. Equivalent to
- * kdb_register_flags with flags set to 0.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register_table() - This function is used to register a kdb command
+ * table.
+ * @kp: pointer to kdb command table
+ * @len: length of kdb command table
*/
-int kdb_register(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen)
+void kdb_register_table(kdbtab_t *kp, size_t len)
{
- return kdb_register_flags(cmd, func, usage, help, minlen, 0);
+ while (len--) {
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
+ kp++;
+ }
}
-EXPORT_SYMBOL_GPL(kdb_register);
-/*
- * kdb_unregister - This function is used to unregister a kernel
- * debugger command. It is generally called when a module which
- * implements kdb commands is unloaded.
- * Inputs:
- * cmd Command name
- * Returns:
- * zero for success, one command not registered.
+/**
+ * kdb_unregister() - This function is used to unregister a kernel debugger
+ * command. It is generally called when a module which
+ * implements kdb command is unloaded.
+ * @cmd: pointer to kdb command
*/
-int kdb_unregister(char *cmd)
+void kdb_unregister(kdbtab_t *cmd)
{
- int i;
- kdbtab_t *kp;
-
- /*
- * find the command.
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kp->cmd_name = NULL;
- return 0;
- }
- }
-
- /* Couldn't find it. */
- return 1;
+ list_del(&cmd->list_node);
}
EXPORT_SYMBOL_GPL(kdb_unregister);
-/* Initialize the kdb command table. */
-static void __init kdb_inittab(void)
-{
- int i;
- kdbtab_t *kp;
-
- for_each_kdbcmd(kp, i)
- kp->cmd_name = NULL;
-
- kdb_register_flags("md", kdb_md, "<vaddr>",
- "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0,
- KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1,
- KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("rd", kdb_rd, "",
- "Display Registers", 0,
- KDB_ENABLE_REG_READ);
- kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0,
- KDB_ENABLE_REG_WRITE);
- kdb_register_flags("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btt", kdb_bt, "<vaddr>",
- "Backtrace process given its struct task address", 0,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("env", kdb_env, "",
- "Show environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("set", kdb_set, "",
- "Set environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("help", kdb_help, "",
- "Display Help Message", 1,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("?", kdb_help, "",
- "Display Help Message", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0,
- KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, 0);
- kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0,
- KDB_ENABLE_REBOOT);
+static kdbtab_t maintab[] = {
+ { .name = "md",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mdr",
+ .func = kdb_md,
+ .usage = "<vaddr> <bytes>",
+ .help = "Display Raw Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mdp",
+ .func = kdb_md,
+ .usage = "<paddr> <bytes>",
+ .help = "Display Physical Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mds",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Symbolically",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mm",
+ .func = kdb_mm,
+ .usage = "<vaddr> <contents>",
+ .help = "Modify Memory Contents",
+ .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "go",
+ .func = kdb_go,
+ .usage = "[<vaddr>]",
+ .help = "Continue Execution",
+ .minlen = 1,
+ .flags = KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .name = "rd",
+ .func = kdb_rd,
+ .usage = "",
+ .help = "Display Registers",
+ .flags = KDB_ENABLE_REG_READ,
+ },
+ { .name = "rm",
+ .func = kdb_rm,
+ .usage = "<reg> <contents>",
+ .help = "Modify Registers",
+ .flags = KDB_ENABLE_REG_WRITE,
+ },
+ { .name = "ef",
+ .func = kdb_ef,
+ .usage = "<vaddr>",
+ .help = "Display exception frame",
+ .flags = KDB_ENABLE_MEM_READ,
+ },
+ { .name = "bt",
+ .func = kdb_bt,
+ .usage = "[<vaddr>]",
+ .help = "Stack traceback",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .name = "btp",
+ .func = kdb_bt,
+ .usage = "<pid>",
+ .help = "Display stack for process <pid>",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "bta",
+ .func = kdb_bt,
+ .usage = "[<state_chars>|A]",
+ .help = "Backtrace all processes whose state matches",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "btc",
+ .func = kdb_bt,
+ .usage = "",
+ .help = "Backtrace current process on each cpu",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "btt",
+ .func = kdb_bt,
+ .usage = "<vaddr>",
+ .help = "Backtrace process given its struct task address",
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .name = "env",
+ .func = kdb_env,
+ .usage = "",
+ .help = "Show environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "set",
+ .func = kdb_set,
+ .usage = "",
+ .help = "Set environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "help",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .minlen = 1,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "?",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "cpu",
+ .func = kdb_cpu,
+ .usage = "<cpunum>",
+ .help = "Switch to new cpu",
+ .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .name = "kgdb",
+ .func = kdb_kgdb,
+ .usage = "",
+ .help = "Enter kgdb mode",
+ .flags = 0,
+ },
+ { .name = "ps",
+ .func = kdb_ps,
+ .usage = "[<state_chars>|A]",
+ .help = "Display active task list",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "pid",
+ .func = kdb_pid,
+ .usage = "<pidnum>",
+ .help = "Switch to another task",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "reboot",
+ .func = kdb_reboot,
+ .usage = "",
+ .help = "Reboot the machine immediately",
+ .flags = KDB_ENABLE_REBOOT,
+ },
#if defined(CONFIG_MODULES)
- kdb_register_flags("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0,
- KDB_ENABLE_INSPECT);
+ { .name = "lsmod",
+ .func = kdb_lsmod,
+ .usage = "",
+ .help = "List loaded kernel modules",
+ .flags = KDB_ENABLE_INSPECT,
+ },
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_flags("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "sr",
+ .func = kdb_sr,
+ .usage = "<key>",
+ .help = "Magic SysRq key",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "dmesg",
+ .func = kdb_dmesg,
+ .usage = "[lines]",
+ .help = "Display syslog buffer",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
- if (arch_kgdb_ops.enable_nmi) {
- kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- }
- kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0,
- KDB_ENABLE_SIGNAL);
- kdb_register_flags("summary", kdb_summary, "",
- "Summarize the system", 4,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "defcmd",
+ .func = kdb_defcmd,
+ .usage = "name \"usage\" \"help\"",
+ .help = "Define a set of commands, down to endefcmd",
+ /*
+ * Macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is safety
+ * checked individually.
+ */
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "kill",
+ .func = kdb_kill,
+ .usage = "<-signal> <pid>",
+ .help = "Send a signal to a process",
+ .flags = KDB_ENABLE_SIGNAL,
+ },
+ { .name = "summary",
+ .func = kdb_summary,
+ .usage = "",
+ .help = "Summarize the system",
+ .minlen = 4,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "per_cpu",
+ .func = kdb_per_cpu,
+ .usage = "<sym> [<bytes>] [<cpu>]",
+ .help = "Display per_cpu variables",
+ .minlen = 3,
+ .flags = KDB_ENABLE_MEM_READ,
+ },
+ { .name = "grephelp",
+ .func = kdb_grep_help,
+ .usage = "",
+ .help = "Display help on | grep",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+};
+
+static kdbtab_t nmicmd = {
+ .name = "disable_nmi",
+ .func = kdb_disable_nmi,
+ .usage = "",
+ .help = "Disable NMI entry to KDB",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ kdb_register_table(maintab, ARRAY_SIZE(maintab));
+ if (arch_kgdb_ops.enable_nmi)
+ kdb_register_table(&nmicmd, 1);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 2e296e4a234c..548fd4059bf9 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -64,7 +64,7 @@
/*
* KDB_MAXBPT describes the total number of breakpoints
- * supported by this architecure.
+ * supported by this architecture.
*/
#define KDB_MAXBPT 16
@@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
-extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
@@ -165,17 +164,7 @@ typedef struct _kdb_bp {
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
-/* The KDB shell command table */
-typedef struct _kdbtab {
- char *cmd_name; /* Command name */
- kdb_func_t cmd_func; /* Function to execute command */
- char *cmd_usage; /* Usage String for this command */
- char *cmd_help; /* Help message for this command */
- short cmd_minlen; /* Minimum legal # command
- * chars required */
- kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
-} kdbtab_t;
-
+extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
/* KDB breakpoint management functions */
@@ -201,15 +190,10 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
-extern unsigned long kdb_task_state(const struct task_struct *p,
- unsigned long mask);
+extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_print_nameval(const char *name, unsigned long val);
-extern void kdb_send_sig(struct task_struct *p, int sig);
-extern void kdb_meminfo_proc_show(void);
extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
@@ -230,15 +214,7 @@ extern struct task_struct *kdb_curr_task(int);
#define kdb_task_has_cpu(p) (task_curr(p))
-/* Simplify coexistence with NPTL */
-#define kdb_do_each_thread(g, p) do_each_thread(g, p)
-#define kdb_while_each_thread(g, p) while_each_thread(g, p)
-
-#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
-
-extern void *debug_kmalloc(size_t size, gfp_t flags);
-extern void debug_kfree(void *);
-extern void debug_kusage(void);
+#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
extern struct task_struct *kdb_current_task;
extern struct pt_regs *kdb_current_regs;
@@ -249,13 +225,19 @@ extern void kdb_kbd_cleanup_state(void);
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
#endif /* CONFIG_KGDB_KDB */
+
+#define kdb_func_printf(format, args...) \
+ kdb_printf("%s: " format, __func__, ## args)
+
+#define kdb_dbg_printf(mask, format, args...) \
+ do { \
+ if (KDB_DEBUG(mask)) \
+ kdb_func_printf(format, ## args); \
+ } while (0)
+
#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 004c5b6c87f8..0a39497140bf 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -10,7 +10,6 @@
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
-#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -18,13 +17,13 @@
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/ctype.h>
#include "kdb_private.h"
/*
@@ -39,68 +38,61 @@
*/
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname,
- symtab);
+ kdb_dbg_printf(AR, "symname=%s, symtab=%px\n", symname, symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
if (symtab->sym_start) {
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 1, "
- "symtab->sym_start=0x%lx\n",
- symtab->sym_start);
+ kdb_dbg_printf(AR, "returns 1, symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
return 1;
}
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 0\n");
+ kdb_dbg_printf(AR, "returns 0\n");
return 0;
}
EXPORT_SYMBOL(kdbgetsymval);
-static char *kdb_name_table[100]; /* arbitrary size */
-
-/*
- * kdbnearsym - Return the name of the symbol with the nearest address
- * less than 'addr'.
+/**
+ * kdbnearsym() - Return the name of the symbol with the nearest address
+ * less than @addr.
+ * @addr: Address to check for near symbol
+ * @symtab: Structure to receive results
*
- * Parameters:
- * addr Address to check for symbol near
- * symtab Structure to receive results
- * Returns:
- * 0 No sections contain this address, symtab zero filled
- * 1 Address mapped to module/symbol/section, data in symtab
- * Remarks:
- * 2.6 kallsyms has a "feature" where it unpacks the name into a
- * string. If that string is reused before the caller expects it
- * then the caller sees its string change without warning. To
- * avoid cluttering up the main kdb code with lots of kdb_strdup,
- * tests and kfree calls, kdbnearsym maintains an LRU list of the
- * last few unique strings. The list is sized large enough to
- * hold active strings, no kdb caller of kdbnearsym makes more
- * than ~20 later calls before using a saved value.
+ * WARNING: This function may return a pointer to a single statically
+ * allocated buffer (namebuf). kdb's unusual calling context (single
+ * threaded, all other CPUs halted) provides us sufficient locking for
+ * this to be safe. The only constraint imposed by the static buffer is
+ * that the caller must consume any previous reply prior to another call
+ * to lookup a new symbol.
+ *
+ * Note that, strictly speaking, some architectures may re-enter the kdb
+ * trap if the system turns out to be very badly damaged and this breaks
+ * the single-threaded assumption above. In these circumstances successful
+ * continuation and exit from the inner trap is unlikely to work and any
+ * user attempting this receives a prominent warning before being allowed
+ * to progress. In these circumstances we remain memory safe because
+ * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do
+ * tolerate the possibility of garbled symbol display from the outer kdb
+ * trap.
+ *
+ * Return:
+ * * 0 - No sections contain this address, symtab zero filled
+ * * 1 - Address mapped to module/symbol/section, data in symtab
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
-#define knt1_size 128 /* must be >= kallsyms table size */
- char *knt1 = NULL;
+ static char namebuf[KSYM_NAME_LEN];
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab);
+ kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
- knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
- if (!knt1) {
- kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
- addr);
- goto out;
- }
+
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
- (char **)(&symtab->mod_name), knt1);
+ (char **)(&symtab->mod_name), namebuf);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
@@ -109,66 +101,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
- if (ret) {
- int i;
- /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
- * set but the buffer passed into kallsyms_lookup is not used,
- * so it contains garbage. The caller has to work out which
- * buffer needs to be saved.
- *
- * What was Rusty smoking when he wrote that code?
- */
- if (symtab->sym_name != knt1) {
- strncpy(knt1, symtab->sym_name, knt1_size);
- knt1[knt1_size-1] = '\0';
- }
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i] &&
- strcmp(kdb_name_table[i], knt1) == 0)
- break;
- }
- if (i >= ARRAY_SIZE(kdb_name_table)) {
- debug_kfree(kdb_name_table[0]);
- memmove(kdb_name_table, kdb_name_table+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-1));
- } else {
- debug_kfree(knt1);
- knt1 = kdb_name_table[i];
- memmove(kdb_name_table+i, kdb_name_table+i+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-i-1));
- }
- i = ARRAY_SIZE(kdb_name_table) - 1;
- kdb_name_table[i] = knt1;
- symtab->sym_name = kdb_name_table[i];
- knt1 = NULL;
- }
-
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret,
- symtab->sym_start, symtab->mod_name, symtab->sym_name,
- symtab->sym_name);
-
+ kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
+ ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
out:
- debug_kfree(knt1);
return ret;
}
-void kdbnearsym_cleanup(void)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i]) {
- debug_kfree(kdb_name_table[i]);
- kdb_name_table[i] = NULL;
- }
- }
-}
-
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
@@ -328,7 +268,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -350,10 +290,10 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -432,10 +372,10 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -481,10 +421,10 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -525,91 +465,15 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
diag = kdb_putarea(addr, w8);
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
-/*
- * kdb_task_state_string - Convert a string containing any of the
- * letters DRSTCZEUIMA to a mask for the process state field and
- * return the value. If no argument is supplied, return the mask
- * that corresponds to environment variable PS, DRSTCZEU by
- * default.
- * Inputs:
- * s String to convert
- * Returns:
- * Mask for process state.
- * Notes:
- * The mask folds data from several sources into a single long value, so
- * be careful not to overlap the bits. TASK_* bits are in the LSB,
- * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
- * is no overlap between TASK_* and EXIT_* but that may not always be
- * true, so EXIT_* bits are shifted left 16 bits before being stored in
- * the mask.
- */
-
-/* unrunnable is < 0 */
-#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
-#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
-#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
-#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
-unsigned long kdb_task_state_string(const char *s)
-{
- long res = 0;
- if (!s) {
- s = kdbgetenv("PS");
- if (!s)
- s = "DRSTCZEU"; /* default value for ps */
- }
- while (*s) {
- switch (*s) {
- case 'D':
- res |= TASK_UNINTERRUPTIBLE;
- break;
- case 'R':
- res |= RUNNING;
- break;
- case 'S':
- res |= TASK_INTERRUPTIBLE;
- break;
- case 'T':
- res |= TASK_STOPPED;
- break;
- case 'C':
- res |= TASK_TRACED;
- break;
- case 'Z':
- res |= EXIT_ZOMBIE << 16;
- break;
- case 'E':
- res |= EXIT_DEAD << 16;
- break;
- case 'U':
- res |= UNRUNNABLE;
- break;
- case 'I':
- res |= IDLE;
- break;
- case 'M':
- res |= DAEMON;
- break;
- case 'A':
- res = ~0UL;
- break;
- default:
- kdb_printf("%s: unknown flag '%c' ignored\n",
- __func__, *s);
- break;
- }
- ++s;
- }
- return res;
-}
/*
* kdb_task_state_char - Return the character that represents the task state.
@@ -620,32 +484,26 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- int cpu;
- char state;
unsigned long tmp;
+ char state;
+ int cpu;
if (!p ||
copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
- cpu = kdb_process_cpu(p);
- state = (p->state == 0) ? 'R' :
- (p->state < 0) ? 'U' :
- (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p->state & TASK_STOPPED) ? 'T' :
- (p->state & TASK_TRACED) ? 'C' :
- (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
- (p->exit_state & EXIT_DEAD) ? 'E' :
- (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ state = task_state_to_char((struct task_struct *) p);
+
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
+ cpu = kdb_process_cpu(p);
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
- state = 'I'; /* idle task */
+ state = '-'; /* idle task */
}
- } else if (!p->mm && state == 'S') {
- state = 'M'; /* sleeping system daemon */
+ } else if (!p->mm && strchr("IMS", state)) {
+ state = tolower(state); /* sleeping system daemon */
}
return state;
}
@@ -655,258 +513,28 @@ char kdb_task_state_char (const struct task_struct *p)
* given by the mask.
* Inputs:
* p struct task for the process
- * mask mask from kdb_task_state_string to select processes
+ * mask set of characters used to select processes; both NULL
+ * and the empty string mean adopt a default filter, which
+ * is to suppress sleeping system daemons and the idle tasks
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
-unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+bool kdb_task_state(const struct task_struct *p, const char *mask)
{
- char state[] = { kdb_task_state_char(p), '\0' };
- return (mask & kdb_task_state_string(state)) != 0;
-}
+ char state = kdb_task_state_char(p);
-/*
- * kdb_print_nameval - Print a name and its value, converting the
- * value to a symbol lookup if possible.
- * Inputs:
- * name field name to print
- * val value of field
- */
-void kdb_print_nameval(const char *name, unsigned long val)
-{
- kdb_symtab_t symtab;
- kdb_printf(" %-11.11s ", name);
- if (kdbnearsym(val, &symtab))
- kdb_symbol_print(val, &symtab,
- KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
- else
- kdb_printf("0x%lx\n", val);
-}
-
-/* Last ditch allocator for debugging, so we can still debug even when
- * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
- * for space usage, not for speed. One smallish memory pool, the free
- * chain is always in ascending address order to allow coalescing,
- * allocations are done in brute force best fit.
- */
-
-struct debug_alloc_header {
- u32 next; /* offset of next header from start of pool */
- u32 size;
- void *caller;
-};
-
-/* The memory returned by this allocator must be aligned, which means
- * so must the header size. Do not assume that sizeof(struct
- * debug_alloc_header) is a multiple of the alignment, explicitly
- * calculate the overhead of this header, including the alignment.
- * The rest of this code must not use sizeof() on any header or
- * pointer to a header.
- */
-#define dah_align 8
-#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
-
-static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
-static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
-static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
-
-/* Locking is awkward. The debug code is called from all contexts,
- * including non maskable interrupts. A normal spinlock is not safe
- * in NMI context. Try to get the debug allocator lock, if it cannot
- * be obtained after a second then give up. If the lock could not be
- * previously obtained on this cpu then only try once.
- *
- * sparse has no annotation for "this function _sometimes_ acquires a
- * lock", so fudge the acquire/release notation.
- */
-static DEFINE_SPINLOCK(dap_lock);
-static int get_dap_lock(void)
- __acquires(dap_lock)
-{
- static int dap_locked = -1;
- int count;
- if (dap_locked == smp_processor_id())
- count = 1;
- else
- count = 1000;
- while (1) {
- if (spin_trylock(&dap_lock)) {
- dap_locked = -1;
- return 1;
- }
- if (!count--)
- break;
- udelay(1000);
- }
- dap_locked = smp_processor_id();
- __acquire(dap_lock);
- return 0;
-}
-
-void *debug_kmalloc(size_t size, gfp_t flags)
-{
- unsigned int rem, h_offset;
- struct debug_alloc_header *best, *bestprev, *prev, *h;
- void *p = NULL;
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return NULL;
- }
- h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first_call) {
- h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
- dah_first_call = 0;
- }
- size = ALIGN(size, dah_align);
- prev = best = bestprev = NULL;
- while (1) {
- if (h->size >= size && (!best || h->size < best->size)) {
- best = h;
- bestprev = prev;
- if (h->size == size)
- break;
- }
- if (!h->next)
- break;
- prev = h;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
- }
- if (!best)
- goto out;
- rem = best->size - size;
- /* The pool must always contain at least one header */
- if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
- goto out;
- if (rem >= dah_overhead) {
- best->size = size;
- h_offset = ((char *)best - debug_alloc_pool) +
- dah_overhead + best->size;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
- h->size = rem - dah_overhead;
- h->next = best->next;
- } else
- h_offset = best->next;
- best->caller = __builtin_return_address(0);
- dah_used += best->size;
- dah_used_max = max(dah_used, dah_used_max);
- if (bestprev)
- bestprev->next = h_offset;
- else
- dah_first = h_offset;
- p = (char *)best + dah_overhead;
- memset(p, POISON_INUSE, best->size - 1);
- *((char *)p + best->size - 1) = POISON_END;
-out:
- spin_unlock(&dap_lock);
- return p;
-}
+ /* If there is no mask, then we will filter code that runs when the
+ * scheduler is idling and any system daemons that are currently
+ * sleeping.
+ */
+ if (!mask || mask[0] == '\0')
+ return !strchr("-ims", state);
-void debug_kfree(void *p)
-{
- struct debug_alloc_header *h;
- unsigned int h_offset;
- if (!p)
- return;
- if ((char *)p < debug_alloc_pool ||
- (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
- kfree(p);
- return;
- }
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return; /* memory leak, cannot be helped */
- }
- h = (struct debug_alloc_header *)((char *)p - dah_overhead);
- memset(p, POISON_FREE, h->size - 1);
- *((char *)p + h->size - 1) = POISON_END;
- h->caller = NULL;
- dah_used -= h->size;
- h_offset = (char *)h - debug_alloc_pool;
- if (h_offset < dah_first) {
- h->next = dah_first;
- dah_first = h_offset;
- } else {
- struct debug_alloc_header *prev;
- unsigned int prev_offset;
- prev = (struct debug_alloc_header *)(debug_alloc_pool +
- dah_first);
- while (1) {
- if (!prev->next || prev->next > h_offset)
- break;
- prev = (struct debug_alloc_header *)
- (debug_alloc_pool + prev->next);
- }
- prev_offset = (char *)prev - debug_alloc_pool;
- if (prev_offset + dah_overhead + prev->size == h_offset) {
- prev->size += dah_overhead + h->size;
- memset(h, POISON_FREE, dah_overhead - 1);
- *((char *)h + dah_overhead - 1) = POISON_END;
- h = prev;
- h_offset = prev_offset;
- } else {
- h->next = prev->next;
- prev->next = h_offset;
- }
- }
- if (h_offset + dah_overhead + h->size == h->next) {
- struct debug_alloc_header *next;
- next = (struct debug_alloc_header *)
- (debug_alloc_pool + h->next);
- h->size += dah_overhead + next->size;
- h->next = next->next;
- memset(next, POISON_FREE, dah_overhead - 1);
- *((char *)next + dah_overhead - 1) = POISON_END;
- }
- spin_unlock(&dap_lock);
-}
+ /* A is a special case that matches all states */
+ if (strchr(mask, 'A'))
+ return true;
-void debug_kusage(void)
-{
- struct debug_alloc_header *h_free, *h_used;
-#ifdef CONFIG_IA64
- /* FIXME: using dah for ia64 unwind always results in a memory leak.
- * Fix that memory leak first, then set debug_kusage_one_time = 1 for
- * all architectures.
- */
- static int debug_kusage_one_time;
-#else
- static int debug_kusage_one_time = 1;
-#endif
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return;
- }
- h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first == 0 &&
- (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
- dah_first_call))
- goto out;
- if (!debug_kusage_one_time)
- goto out;
- debug_kusage_one_time = 0;
- kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
- __func__, dah_first);
- if (dah_first) {
- h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %px size %d\n", __func__, h_used,
- h_used->size);
- }
- do {
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
- h_free = (struct debug_alloc_header *)
- (debug_alloc_pool + h_free->next);
- } while (h_free->next);
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- if ((char *)h_used - debug_alloc_pool !=
- sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
-out:
- spin_unlock(&dap_lock);
+ return strchr(mask, state);
}
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 27725754ac99..6f0c358e73d8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -7,30 +7,84 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
-#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>
-int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
-EXPORT_SYMBOL_GPL(delayacct_on);
+DEFINE_STATIC_KEY_FALSE(delayacct_key);
+int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
-static int __init delayacct_setup_disable(char *str)
+static void set_delayacct(bool enabled)
{
- delayacct_on = 0;
+ if (enabled) {
+ static_branch_enable(&delayacct_key);
+ delayacct_on = 1;
+ } else {
+ delayacct_on = 0;
+ static_branch_disable(&delayacct_key);
+ }
+}
+
+static int __init delayacct_setup_enable(char *str)
+{
+ delayacct_on = 1;
return 1;
}
-__setup("nodelayacct", delayacct_setup_disable);
+__setup("delayacct", delayacct_setup_enable);
void delayacct_init(void)
{
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
+ set_delayacct(delayacct_on);
}
+#ifdef CONFIG_PROC_SYSCTL
+static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int state = delayacct_on;
+ struct ctl_table t;
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_delayacct(state);
+ return err;
+}
+
+static struct ctl_table kern_delayacct_table[] = {
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_delayacct_table);
+ return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
+#endif
+
void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
@@ -42,10 +96,9 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
- u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
- s64 ns = ktime_get_ns() - *start;
+ s64 ns = local_clock() - *start;
unsigned long flags;
if (ns > 0) {
@@ -58,7 +111,7 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
void __delayacct_blkio_start(void)
{
- current->delays->blkio_start = ktime_get_ns();
+ current->delays->blkio_start = local_clock();
}
/*
@@ -67,22 +120,13 @@ void __delayacct_blkio_start(void)
*/
void __delayacct_blkio_end(struct task_struct *p)
{
- struct task_delay_info *delays = p->delays;
- u64 *total;
- u32 *count;
-
- if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
- total = &delays->swapin_delay;
- count = &delays->swapin_count;
- } else {
- total = &delays->blkio_delay;
- count = &delays->blkio_count;
- }
-
- delayacct_end(&delays->lock, &delays->blkio_start, total, count);
+ delayacct_end(&p->delays->lock,
+ &p->delays->blkio_start,
+ &p->delays->blkio_delay,
+ &p->delays->blkio_count);
}
-int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
@@ -117,6 +161,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+ if (!tsk->delays)
+ return 0;
+
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
@@ -128,10 +175,19 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
+ tmp = d->compact_delay_total + tsk->delays->compact_delay;
+ d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+ tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
+ d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
+ tmp = d->irq_delay_total + tsk->delays->irq_delay;
+ d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count;
+ d->compact_count += tsk->delays->compact_count;
+ d->wpcopy_count += tsk->delays->wpcopy_count;
+ d->irq_count += tsk->delays->irq_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -143,35 +199,92 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
unsigned long flags;
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- ret = nsec_to_clock_t(tsk->delays->blkio_delay +
- tsk->delays->swapin_delay);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
void __delayacct_freepages_start(void)
{
- current->delays->freepages_start = ktime_get_ns();
+ current->delays->freepages_start = local_clock();
}
void __delayacct_freepages_end(void)
{
- delayacct_end(
- &current->delays->lock,
- &current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(&current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
- current->delays->thrashing_start = ktime_get_ns();
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
+ current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
&current->delays->thrashing_count);
}
+
+void __delayacct_swapin_start(void)
+{
+ current->delays->swapin_start = local_clock();
+}
+
+void __delayacct_swapin_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->swapin_start,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+}
+
+void __delayacct_compact_start(void)
+{
+ current->delays->compact_start = local_clock();
+}
+
+void __delayacct_compact_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->compact_start,
+ &current->delays->compact_delay,
+ &current->delays->compact_count);
+}
+
+void __delayacct_wpcopy_start(void)
+{
+ current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->wpcopy_start,
+ &current->delays->wpcopy_delay,
+ &current->delays->wpcopy_count);
+}
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->delays->lock, flags);
+ task->delays->irq_delay += delta;
+ task->delays->irq_count++;
+ raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+}
+
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1da3f44f2565..d62f5957f36b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,10 +1,32 @@
# SPDX-License-Identifier: GPL-2.0-only
+config NO_DMA
+ bool
+
config HAS_DMA
bool
depends on !NO_DMA
default y
+config DMA_OPS
+ depends on HAS_DMA
+ bool
+
+#
+# IOMMU drivers that can bypass the IOMMU code and optionally use the direct
+# mapping fast path should select this option and set the dma_ops_bypass
+# flag in struct device where applicable
+#
+config DMA_OPS_BYPASS
+ bool
+
+# Lets platform IOMMU driver choose between bypass and IOMMU
+config ARCH_HAS_DMA_MAP_DIRECT
+ bool
+
+config NEED_SG_DMA_FLAGS
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -14,21 +36,24 @@ config NEED_DMA_MAP_STATE
config ARCH_DMA_ADDR_T_64BIT
def_bool 64BIT || PHYS_ADDR_T_64BIT
-config ARCH_HAS_DMA_COHERENCE_H
- bool
-
config ARCH_HAS_DMA_SET_MASK
bool
#
# Select this option if the architecture needs special handling for
# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
-# people thing of when saying write combine, so very few platforms should
+# people think of when saying write combine, so very few platforms should
# need to enable this.
#
config ARCH_HAS_DMA_WRITE_COMBINE
bool
+#
+# Select if the architectures provides the arch_dma_mark_clean hook
+#
+config ARCH_HAS_DMA_MARK_CLEAN
+ bool
+
config DMA_DECLARE_COHERENT
bool
@@ -54,17 +79,47 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
-config DMA_NONCOHERENT_CACHE_SYNC
- bool
-
-config DMA_VIRT_OPS
+#
+# Select this option if the architecture assumes DMA devices are coherent
+# by default.
+#
+config ARCH_DMA_DEFAULT_COHERENT
bool
- depends on HAS_DMA
config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config SWIOTLB_DYNAMIC
+ bool "Dynamic allocation of DMA bounce buffers"
+ default n
+ depends on SWIOTLB
+ help
+ This enables dynamic resizing of the software IO TLB. The kernel
+ starts with one memory pool at boot and it will allocate additional
+ pools as needed. To reduce run-time kernel memory requirements, you
+ may have to specify a smaller size of the initial pool using
+ "swiotlb=" on the kernel command line.
+
+ If unsure, say N.
+
+config DMA_BOUNCE_UNALIGNED_KMALLOC
+ bool
+ depends on SWIOTLB
+
+config DMA_RESTRICTED_POOL
+ bool "DMA Restricted Pool"
+ depends on OF && OF_RESERVED_MEM && SWIOTLB
+ help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+ and <kernel/dma/swiotlb.c>.
+ If unsure, say "n".
+
#
# Should be selected if we can mmap non-coherent mappings to userspace.
# The only thing that is really required is a way to set an uncached bit
@@ -78,15 +133,25 @@ config DMA_COHERENT_POOL
select GENERIC_ALLOCATOR
bool
-config DMA_REMAP
+config DMA_GLOBAL_POOL
+ select DMA_DECLARE_COHERENT
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
bool
- depends on MMU
- select DMA_NONCOHERENT_MMAP
config DMA_DIRECT_REMAP
bool
- select DMA_REMAP
select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
+
+#
+# Fallback to arch code for DMA allocations. This should eventually go away.
+#
+config ARCH_HAS_DMA_ALLOC
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
+ depends on !DMA_GLOBAL_POOL
+ bool
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
@@ -99,10 +164,22 @@ config DMA_CMA
You can disable CMA by specifying "cma=0" on the kernel's command
line.
- For more information see <include/linux/dma-contiguous.h>.
+ For more information see <kernel/dma/contiguous.c>.
If unsure, say "n".
if DMA_CMA
+
+config DMA_NUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
+ depends on NUMA
+ help
+ Enable this option to get numa CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
+
+ You can set the size of pernuma CMA by specifying "cma_pernuma=size"
+ or set the node id and its size of CMA by specifying "numa_cma=
+ <node>:size[,<node>:size]" on the kernel's command line.
+
comment "Default contiguous memory area size:"
config CMA_SIZE_MBYTES
@@ -147,7 +224,7 @@ endchoice
config CMA_ALIGNMENT
int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
- range 4 12
+ range 2 12
default 8
help
DMA mapping framework by default aligns all buffers to the smallest
@@ -174,11 +251,6 @@ config DMA_API_DEBUG
drivers like double-freeing of DMA mappings or freeing mappings that
were never allocated.
- This also attempts to catch cases where a page owned by DMA is
- accessed by the cpu in a way that could cause data corruption. For
- example, this enables cow_user_page() to check that the source page is
- not undergoing DMA.
-
This option causes a performance degradation. Use only if you want to
debug device drivers and dma interactions.
@@ -200,3 +272,12 @@ config DMA_API_DEBUG_SG
is technically out-of-spec.
If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+ bool "Enable benchmarking of streaming DMA mapping"
+ depends on DEBUG_FS
+ help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.
+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 370f63344e9c..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,10 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
+obj-$(CONFIG_DMA_OPS) += ops_helpers.o
+obj-$(CONFIG_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 2a0c4985f38e..ff5683a57f77 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -7,7 +7,8 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
struct dma_coherent_mem {
void *virt_base;
@@ -19,8 +20,6 @@ struct dma_coherent_mem {
bool use_dev_dma_pfn_offset;
};
-static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
-
static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
{
if (dev && dev->dma_mem)
@@ -32,65 +31,56 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
struct dma_coherent_mem * mem)
{
if (mem->use_dev_dma_pfn_offset)
- return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT;
- else
- return mem->device_base;
+ return phys_to_dma(dev, PFN_PHYS(mem->pfn_base));
+ return mem->device_base;
}
-static int dma_init_coherent_memory(phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size,
- struct dma_coherent_mem **mem)
+static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
{
- struct dma_coherent_mem *dma_mem = NULL;
- void *mem_base = NULL;
+ struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
- int ret;
+ void *mem_base;
- if (!size) {
- ret = -EINVAL;
- goto out;
- }
+ if (!size)
+ return ERR_PTR(-EINVAL);
mem_base = memremap(phys_addr, size, MEMREMAP_WC);
- if (!mem_base) {
- ret = -EINVAL;
- goto out;
- }
+ if (!mem_base)
+ return ERR_PTR(-EINVAL);
+
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dma_mem) {
- ret = -ENOMEM;
- goto out;
- }
- dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dma_mem->bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!dma_mem)
+ goto out_unmap_membase;
+ dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
+ if (!dma_mem->bitmap)
+ goto out_free_dma_mem;
dma_mem->virt_base = mem_base;
dma_mem->device_base = device_addr;
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
+ dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
spin_lock_init(&dma_mem->spinlock);
- *mem = dma_mem;
- return 0;
+ return dma_mem;
-out:
+out_free_dma_mem:
kfree(dma_mem);
- if (mem_base)
- memunmap(mem_base);
- return ret;
+out_unmap_membase:
+ memunmap(mem_base);
+ pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n",
+ &phys_addr, size / SZ_1M);
+ return ERR_PTR(-ENOMEM);
}
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
{
if (!mem)
return;
memunmap(mem->virt_base);
- kfree(mem->bitmap);
+ bitmap_free(mem->bitmap);
kfree(mem);
}
@@ -107,22 +97,47 @@ static int dma_assign_coherent_memory(struct device *dev,
return 0;
}
+/*
+ * Declare a region of memory to be handed out by dma_alloc_coherent() when it
+ * is asked for coherent memory for this device. This shall only be used
+ * from platform code, usually based on the device tree description.
+ *
+ * phys_addr is the CPU physical address to which the memory is currently
+ * assigned (this will be ioremapped so the CPU can access the region).
+ *
+ * device_addr is the DMA address the device needs to be programmed with to
+ * actually address this memory (this will be handed out as the dma_addr_t in
+ * dma_alloc_coherent()).
+ *
+ * size is the size of the area (must be a multiple of PAGE_SIZE).
+ *
+ * As a simplification for the platforms, only *one* such region of memory may
+ * be declared per device.
+ */
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_addr_t device_addr, size_t size)
{
struct dma_coherent_mem *mem;
int ret;
- ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
- if (ret)
- return ret;
+ mem = dma_init_coherent_memory(phys_addr, device_addr, size, false);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
- dma_release_coherent_memory(mem);
+ _dma_release_coherent_memory(mem);
return ret;
}
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev) {
+ _dma_release_coherent_memory(dev->dma_mem);
+ dev->dma_mem = NULL;
+ }
+}
+
static void *__dma_alloc_from_coherent(struct device *dev,
struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
@@ -181,16 +196,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
return 1;
}
-void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle)
-{
- if (!dma_coherent_default_memory)
- return NULL;
-
- return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
- dma_handle);
-}
-
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
int order, void *vaddr)
{
@@ -226,15 +231,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
return __dma_release_from_coherent(mem, order, vaddr);
}
-int dma_release_from_global_coherent(int order, void *vaddr)
-{
- if (!dma_coherent_default_memory)
- return 0;
-
- return __dma_release_from_coherent(dma_coherent_default_memory, order,
- vaddr);
-}
-
static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
{
@@ -280,6 +276,28 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
+
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
+{
+ if (!dma_coherent_default_memory)
+ return NULL;
+
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
+}
+
+int dma_release_from_global_coherent(int order, void *vaddr)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_release_from_coherent(dma_coherent_default_memory, order,
+ vaddr);
+}
+
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
{
@@ -290,6 +308,19 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
vaddr, size, ret);
}
+int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
+{
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ dma_coherent_default_memory = mem;
+ pr_info("DMA: default coherent area is set\n");
+ return 0;
+}
+#endif /* CONFIG_DMA_GLOBAL_POOL */
+
/*
* Support for reserved memory regions defined in device tree
*/
@@ -298,25 +329,22 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
+#ifdef CONFIG_DMA_GLOBAL_POOL
static struct reserved_mem *dma_reserved_default_memory __initdata;
+#endif
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- struct dma_coherent_mem *mem = rmem->priv;
- int ret;
-
- if (!mem) {
- ret = dma_init_coherent_memory(rmem->base, rmem->base,
- rmem->size, &mem);
- if (ret) {
- pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
- &rmem->base, (unsigned long)rmem->size / SZ_1M);
- return ret;
- }
+ if (!rmem->priv) {
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(rmem->base, rmem->base,
+ rmem->size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ rmem->priv = mem;
}
- mem->use_dev_dma_pfn_offset = true;
- rmem->priv = mem;
- dma_assign_coherent_memory(dev, mem);
+ dma_assign_coherent_memory(dev, rmem->priv);
return 0;
}
@@ -344,7 +372,9 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: regions without no-map are not yet supported\n");
return -EINVAL;
}
+#endif
+#ifdef CONFIG_DMA_GLOBAL_POOL
if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
WARN(dma_reserved_default_memory,
"Reserved memory: region for default DMA coherent area is redefined\n");
@@ -358,31 +388,16 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
return 0;
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
static int __init dma_init_reserved_memory(void)
{
- const struct reserved_mem_ops *ops;
- int ret;
-
if (!dma_reserved_default_memory)
return -ENOMEM;
-
- ops = dma_reserved_default_memory->ops;
-
- /*
- * We rely on rmem_dma_device_init() does not propagate error of
- * dma_assign_coherent_memory() for "NULL" device.
- */
- ret = ops->device_init(dma_reserved_default_memory, NULL);
-
- if (!ret) {
- dma_coherent_default_memory = dma_reserved_default_memory->priv;
- pr_info("DMA: default coherent area is set\n");
- }
-
- return ret;
+ return dma_init_global_coherent(dma_reserved_default_memory->base,
+ dma_reserved_default_memory->size);
}
-
core_initcall(dma_init_reserved_memory);
+#endif /* CONFIG_DMA_GLOBAL_POOL */
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 15bc5026c485..f005c66f378c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -5,6 +5,34 @@
* Written by:
* Marek Szyprowski <m.szyprowski@samsung.com>
* Michal Nazarewicz <mina86@mina86.com>
+ *
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more than 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
*/
#define pr_fmt(fmt) "cma: " fmt
@@ -16,13 +44,13 @@
#endif
#include <asm/page.h>
-#include <asm/dma-contiguous.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/sizes.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/cma.h>
+#include <linux/nospec.h>
#ifdef CONFIG_CMA_SIZE_MBYTES
#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
@@ -69,20 +97,57 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
+#ifdef CONFIG_DMA_NUMA_CMA
+
+static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
+static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata;
+static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
+static phys_addr_t pernuma_size_bytes __initdata;
+
+static int __init early_numa_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ numa_cma_size[nid] = tmp;
+
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else
+ break;
+ }
+
+ return 0;
+}
+early_param("numa_cma", early_numa_cma);
+
+static int __init early_cma_pernuma(char *p)
+{
+ pernuma_size_bytes = memparse(p, &p);
+ return 0;
+}
+early_param("cma_pernuma", early_cma_pernuma);
+#endif
+
#ifdef CONFIG_CMA_SIZE_PERCENTAGE
static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
{
- struct memblock_region *reg;
- unsigned long total_pages = 0;
-
- /*
- * We cannot use memblock_phys_mem_size() here, because
- * memblock_analyze() has not been called yet.
- */
- for_each_memblock(memory, reg)
- total_pages += memblock_region_memory_end_pfn(reg) -
- memblock_region_memory_base_pfn(reg);
+ unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size());
return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
}
@@ -96,6 +161,51 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
+#ifdef CONFIG_DMA_NUMA_CMA
+static void __init dma_numa_cma_reserve(void)
+{
+ int nid;
+
+ for_each_node(nid) {
+ int ret;
+ char name[CMA_MAX_NAME];
+ struct cma **cma;
+
+ if (!node_online(nid)) {
+ if (pernuma_size_bytes || numa_cma_size[nid])
+ pr_warn("invalid node %d specified\n", nid);
+ continue;
+ }
+
+ if (pernuma_size_bytes) {
+
+ cma = &dma_contiguous_pernuma_area[nid];
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+
+ if (numa_cma_size[nid]) {
+
+ cma = &dma_contiguous_numa_area[nid];
+ snprintf(name, sizeof(name), "numa%d", nid);
+ ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false,
+ name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+ }
+}
+#else
+static inline void __init dma_numa_cma_reserve(void)
+{
+}
+#endif
+
/**
* dma_contiguous_reserve() - reserve area(s) for contiguous memory handling
* @limit: End address of the reserved memory (optional, 0 for any).
@@ -112,6 +222,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
phys_addr_t selected_limit = limit;
bool fixed = false;
+ dma_numa_cma_reserve();
+
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
@@ -143,6 +255,11 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
}
}
+void __weak
+dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
+{
+}
+
/**
* dma_contiguous_reserve_area() - reserve custom contiguous area
* @size: Size of the reserved area (in bytes),
@@ -215,40 +332,65 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
}
+static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
+{
+ unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
+
+ return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
+}
+
/**
* dma_alloc_contiguous() - allocate contiguous pages
* @dev: Pointer to device for which the allocation is performed.
* @size: Requested allocation size.
* @gfp: Allocation flags.
*
- * This function allocates contiguous memory buffer for specified device. It
- * tries to use device specific contiguous memory area if available, or the
- * default global one.
+ * tries to use device specific contiguous memory area if available, or it
+ * tries to use per-numa cma, if the allocation fails, it will fallback to
+ * try default global one.
*
- * Note that it byapss one-page size of allocations from the global area as
- * the addresses within one page are always contiguous, so there is no need
- * to waste CMA pages for that kind; it also helps reduce fragmentations.
+ * Note that it bypass one-page size of allocations from the per-numa and
+ * global area as the addresses within one page are always contiguous, so
+ * there is no need to waste CMA pages for that kind; it also helps reduce
+ * fragmentations.
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
- size_t count = size >> PAGE_SHIFT;
- struct page *page = NULL;
- struct cma *cma = NULL;
-
- if (dev && dev->cma_area)
- cma = dev->cma_area;
- else if (count > 1)
- cma = dma_contiguous_default_area;
+#ifdef CONFIG_DMA_NUMA_CMA
+ int nid = dev_to_node(dev);
+#endif
/* CMA can be used only in the context which permits sleeping */
- if (cma && gfpflags_allow_blocking(gfp)) {
- size_t align = get_order(size);
- size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
-
- page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
+ if (!gfpflags_allow_blocking(gfp))
+ return NULL;
+ if (dev->cma_area)
+ return cma_alloc_aligned(dev->cma_area, size, gfp);
+ if (size <= PAGE_SIZE)
+ return NULL;
+
+#ifdef CONFIG_DMA_NUMA_CMA
+ if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
+ struct cma *cma = dma_contiguous_pernuma_area[nid];
+ struct page *page;
+
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
+
+ cma = dma_contiguous_numa_area[nid];
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
}
+#endif
+ if (!dma_contiguous_default_area)
+ return NULL;
- return page;
+ return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
}
/**
@@ -264,9 +406,30 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
*/
void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{
- if (!cma_release(dev_get_cma_area(dev), page,
- PAGE_ALIGN(size) >> PAGE_SHIFT))
- __free_pages(page, get_order(size));
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ /* if dev has its own cma, free page from there */
+ if (dev->cma_area) {
+ if (cma_release(dev->cma_area, page, count))
+ return;
+ } else {
+ /*
+ * otherwise, page is from either per-numa cma or default cma
+ */
+#ifdef CONFIG_DMA_NUMA_CMA
+ if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
+ page, count))
+ return;
+ if (cma_release(dma_contiguous_numa_area[page_to_nid(page)],
+ page, count))
+ return;
+#endif
+ if (cma_release(dma_contiguous_default_area, page, count))
+ return;
+ }
+
+ /* not in any cma, free from buddy */
+ __free_pages(page, get_order(size));
}
/*
@@ -282,14 +445,14 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- dev_set_cma_area(dev, rmem->priv);
+ dev->cma_area = rmem->priv;
return 0;
}
static void rmem_cma_device_release(struct reserved_mem *rmem,
struct device *dev)
{
- dev_set_cma_area(dev, NULL);
+ dev->cma_area = NULL;
}
static const struct reserved_mem_ops rmem_cma_ops = {
@@ -299,8 +462,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -316,7 +477,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
@@ -330,7 +491,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
- dma_contiguous_set_default(cma);
+ dma_contiguous_default_area = cma;
rmem->ops = &rmem_cma_ops;
rmem->priv = cma;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..a6e3792b15f8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -9,10 +9,9 @@
#include <linux/sched/task_stack.h>
#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
#include <linux/sched/task.h>
#include <linux/stacktrace.h>
-#include <linux/dma-debug.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/debugfs.h>
@@ -24,8 +23,8 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
-
#include <asm/sections.h>
+#include "debug.h"
#define HASH_SIZE 16384ULL
#define HASH_FN_SHIFT 13
@@ -54,6 +53,7 @@ enum map_err_types {
* struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
* @list: node on pre-allocated free_entries list
* @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @dev_addr: dma address
* @size: length of the mapping
* @type: single, page, sg, coherent
* @direction: enum dma_data_direction
@@ -62,7 +62,8 @@ enum map_err_types {
* @pfn: page frame of the start address
* @offset: offset of mapping relative to pfn
* @map_err_type: track whether dma_mapping_error() was checked
- * @stacktrace: support backtraces when a violation is detected
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
*/
struct dma_debug_entry {
struct list_head list;
@@ -139,13 +140,17 @@ static const char *const maperr2str[] = {
static const char *type2name[] = {
[dma_debug_single] = "single",
- [dma_debug_sg] = "scather-gather",
+ [dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
[dma_debug_resource] = "resource",
};
-static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
- "DMA_FROM_DEVICE", "DMA_NONE" };
+static const char *dir2name[] = {
+ [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL",
+ [DMA_TO_DEVICE] = "DMA_TO_DEVICE",
+ [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE",
+ [DMA_NONE] = "DMA_NONE",
+};
/*
* The access to some variables in this macro is racy. We can't use atomic_t
@@ -347,11 +352,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned long *flags)
{
- unsigned int max_range = dma_get_max_seg_size(ref->dev);
struct dma_debug_entry *entry, index = *ref;
- unsigned int range = 0;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
- while (range <= max_range) {
+ for (int i = 0; i < limit; i++) {
entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
@@ -361,7 +365,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
* Nothing found, go back a hash bucket
*/
put_hash_bucket(*bucket, *flags);
- range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
}
@@ -395,37 +398,6 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry)
}
/*
- * Dump mapping entries for debugging purposes
- */
-void debug_dma_dump_mappings(struct device *dev)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
-
- list_for_each_entry(entry, &bucket->list, list) {
- if (!dev || dev == entry->dev) {
- dev_info(entry->dev,
- "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- }
-
- spin_unlock_irqrestore(&bucket->lock, flags);
- cond_resched();
- }
-}
-
-/*
* For each mapping (initial cacheline in the case of
* dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
* scatterlist, or the cacheline specified in dma_map_single) insert
@@ -444,11 +416,8 @@ void debug_dma_dump_mappings(struct device *dev)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
- *
- * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if any cachelines in the given page are in the active set.
*/
-static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC);
static DEFINE_SPINLOCK(radix_lock);
#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
@@ -493,10 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
overlap = active_cacheline_set_overlap(cln, ++overlap);
/* If we overflowed the overlap counter then we're potentially
- * leaking dma-mappings. Otherwise, if maps and unmaps are
- * balanced then this overflow may cause false negatives in
- * debug_dma_assert_idle() as the cacheline may be marked idle
- * prematurely.
+ * leaking dma-mappings.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
@@ -551,58 +517,75 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&radix_lock, flags);
}
-/**
- * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_cacheline tree
- *
- * Place a call to this routine in cases where the cpu touching the page
- * before the dma completes (page is dma_unmapped) will lead to data
- * corruption.
+/*
+ * Dump mappings entries on kernel space for debugging purposes
*/
-void debug_dma_assert_idle(struct page *page)
+void debug_dma_dump_mappings(struct device *dev)
{
- static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
- struct dma_debug_entry *entry = NULL;
- void **results = (void **) &ents;
- unsigned int nents, i;
- unsigned long flags;
+ int idx;
phys_addr_t cln;
- if (dma_debug_disabled())
- return;
-
- if (!page)
- return;
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
- cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
- spin_lock_irqsave(&radix_lock, flags);
- nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
- CACHELINES_PER_PAGE);
- for (i = 0; i < nents; i++) {
- phys_addr_t ent_cln = to_cacheline_number(ents[i]);
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!dev || dev == entry->dev) {
+ cln = to_cacheline_number(entry);
+ dev_info(entry->dev,
+ "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ &cln, dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
- if (ent_cln == cln) {
- entry = ents[i];
- break;
- } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
- break;
+ cond_resched();
}
- spin_unlock_irqrestore(&radix_lock, flags);
+}
- if (!entry)
- return;
+/*
+ * Dump mappings entries on user space via debugfs
+ */
+static int dump_show(struct seq_file *seq, void *v)
+{
+ int idx;
+ phys_addr_t cln;
- cln = to_cacheline_number(entry);
- err_printk(entry->dev, entry,
- "cpu touching an active dma mapped cacheline [cln=%pa]\n",
- &cln);
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ cln = to_cacheline_number(entry);
+ seq_printf(seq,
+ "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
+ dev_driver_string(entry->dev),
+ dev_name(entry->dev),
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ &cln, dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
+ return 0;
}
+DEFINE_SHOW_ATTRIBUTE(dump);
/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
-static void add_dma_entry(struct dma_debug_entry *entry)
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
struct hash_bucket *bucket;
unsigned long flags;
@@ -614,13 +597,12 @@ static void add_dma_entry(struct dma_debug_entry *entry)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
-
- /* TODO: report -EEXIST errors here as overlapping mappings are
- * not supported by the DMA API
- */
}
static int dma_debug_create_entries(gfp_t gfp)
@@ -656,15 +638,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
-static void __dma_entry_alloc_check_leak(void)
+/*
+ * This should be called outside of free_entries_lock scope to avoid potential
+ * deadlocks with serial consoles that use DMA.
+ */
+static void __dma_entry_alloc_check_leak(u32 nr_entries)
{
- u32 tmp = nr_total_entries % nr_prealloc_entries;
+ u32 tmp = nr_entries % nr_prealloc_entries;
/* Shout each time we tick over some multiple of the initial pool */
if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
- nr_total_entries,
- (nr_total_entries / nr_prealloc_entries));
+ nr_entries,
+ (nr_entries / nr_prealloc_entries));
}
}
@@ -675,8 +661,10 @@ static void __dma_entry_alloc_check_leak(void)
*/
static struct dma_debug_entry *dma_entry_alloc(void)
{
+ bool alloc_check_leak = false;
struct dma_debug_entry *entry;
unsigned long flags;
+ u32 nr_entries;
spin_lock_irqsave(&free_entries_lock, flags);
if (num_free_entries == 0) {
@@ -686,13 +674,17 @@ static struct dma_debug_entry *dma_entry_alloc(void)
pr_err("debugging out of memory - disabling\n");
return NULL;
}
- __dma_entry_alloc_check_leak();
+ alloc_check_leak = true;
+ nr_entries = nr_total_entries;
}
entry = __dma_entry_alloc();
spin_unlock_irqrestore(&free_entries_lock, flags);
+ if (alloc_check_leak)
+ __dma_entry_alloc_check_leak(nr_entries);
+
#ifdef CONFIG_STACKTRACE
entry->stack_len = stack_trace_save(entry->stack_entries,
ARRAY_SIZE(entry->stack_entries),
@@ -817,34 +809,7 @@ static const struct file_operations filter_fops = {
.llseek = default_llseek,
};
-static int dump_show(struct seq_file *seq, void *v)
-{
- int idx;
-
- for (idx = 0; idx < HASH_SIZE; idx++) {
- struct hash_bucket *bucket = &dma_entry_hash[idx];
- struct dma_debug_entry *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&bucket->lock, flags);
- list_for_each_entry(entry, &bucket->list, list) {
- seq_printf(seq,
- "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
- dev_name(entry->dev),
- dev_driver_string(entry->dev),
- type2name[entry->type], idx,
- phys_addr(entry), entry->pfn,
- entry->dev_addr, entry->size,
- dir2name[entry->direction],
- maperr2str[entry->map_err_type]);
- }
- spin_unlock_irqrestore(&bucket->lock, flags);
- }
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(dump);
-
-static void dma_debug_fs_init(void)
+static int __init dma_debug_fs_init(void)
{
struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
@@ -857,7 +822,10 @@ static void dma_debug_fs_init(void)
debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
+
+ return 0;
}
+core_initcall_sync(dma_debug_fs_init);
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
{
@@ -882,7 +850,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
{
struct device *dev = data;
- struct dma_debug_entry *uninitialized_var(entry);
+ struct dma_debug_entry *entry;
int count;
if (dma_debug_disabled())
@@ -909,7 +877,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
return 0;
}
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
{
struct notifier_block *nb;
@@ -942,8 +910,6 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- dma_debug_fs_init();
-
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
dma_debug_create_entries(GFP_KERNEL);
@@ -977,7 +943,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -986,7 +952,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
@@ -1071,7 +1037,7 @@ static void check_unmap(struct dma_debug_entry *ref)
/*
* Drivers should use dma_mapping_error() to check the returned
* addresses of dma_map_single() and dma_map_page().
- * If not, print this warning message. See Documentation/DMA-API.txt.
+ * If not, print this warning message. See Documentation/core-api/dma-api.rst.
*/
if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
err_printk(ref->dev, entry,
@@ -1116,20 +1082,10 @@ static void check_for_stack(struct device *dev,
}
}
-static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
-{
- unsigned long a1 = (unsigned long)addr;
- unsigned long b1 = a1 + len;
- unsigned long a2 = (unsigned long)start;
- unsigned long b2 = (unsigned long)end;
-
- return !(b1 <= a2 || a1 >= b2);
-}
-
static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
{
- if (overlap(addr, len, _stext, _etext) ||
- overlap(addr, len, __start_rodata, __end_rodata))
+ if (memory_intersects(_stext, _etext, addr, len) ||
+ memory_intersects(__start_rodata, __end_rodata, addr, len))
err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
}
@@ -1251,7 +1207,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr)
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1268,7 +1225,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->dev = dev;
entry->type = dma_debug_single;
entry->pfn = page_to_pfn(page);
- entry->offset = offset,
+ entry->offset = offset;
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
@@ -1282,9 +1239,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
check_for_illegal_area(dev, addr, size);
}
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
-EXPORT_SYMBOL(debug_dma_map_page);
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
@@ -1324,13 +1280,13 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL(debug_dma_mapping_error);
-void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
struct dma_debug_entry ref = {
.type = dma_debug_single,
.dev = dev,
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = direction,
};
@@ -1339,10 +1295,10 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
return;
check_unmap(&ref);
}
-EXPORT_SYMBOL(debug_dma_unmap_page);
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
struct scatterlist *s;
@@ -1351,6 +1307,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (unlikely(dma_debug_disabled()))
return;
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_page(s), s->offset);
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
for_each_sg(sg, s, mapped_ents, i) {
entry = dma_entry_alloc();
if (!entry)
@@ -1359,25 +1321,18 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
entry->pfn = page_to_pfn(sg_page(s));
- entry->offset = s->offset,
+ entry->offset = s->offset;
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- check_for_stack(dev, sg_page(s), s->offset);
-
- if (!PageHighMem(sg_page(s))) {
- check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
- }
-
check_sg_segment(dev, s);
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
}
-EXPORT_SYMBOL(debug_dma_map_sg);
static int get_nr_mapped_entries(struct device *dev,
struct dma_debug_entry *ref)
@@ -1429,10 +1384,10 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
check_unmap(&ref);
}
}
-EXPORT_SYMBOL(debug_dma_unmap_sg);
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1462,17 +1417,17 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
else
entry->pfn = page_to_pfn(virt_to_page(virt));
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
- void *virt, dma_addr_t addr)
+ void *virt, dma_addr_t dma_addr)
{
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
.offset = offset_in_page(virt),
- .dev_addr = addr,
+ .dev_addr = dma_addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
};
@@ -1493,7 +1448,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1513,9 +1469,8 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
-EXPORT_SYMBOL(debug_dma_map_resource);
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
@@ -1533,7 +1488,6 @@ void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
check_unmap(&ref);
}
-EXPORT_SYMBOL(debug_dma_unmap_resource);
void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
size_t size, int direction)
@@ -1552,7 +1506,6 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
check_sync(dev, &ref, true);
}
-EXPORT_SYMBOL(debug_dma_sync_single_for_cpu);
void debug_dma_sync_single_for_device(struct device *dev,
dma_addr_t dma_handle, size_t size,
@@ -1572,7 +1525,6 @@ void debug_dma_sync_single_for_device(struct device *dev,
check_sync(dev, &ref, false);
}
-EXPORT_SYMBOL(debug_dma_sync_single_for_device);
void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
@@ -1605,7 +1557,6 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
check_sync(dev, &ref, true);
}
}
-EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
@@ -1637,7 +1588,6 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
check_sync(dev, &ref, false);
}
}
-EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
static int __init dma_debug_driver_setup(char *str)
{
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
new file mode 100644
index 000000000000..f525197d3cae
--- /dev/null
+++ b/kernel/dma/debug.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2008 Advanced Micro Devices, Inc.
+ *
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ */
+
+#ifndef _KERNEL_DMA_DEBUG_H
+#define _KERNEL_DMA_DEBUG_H
+
+#ifdef CONFIG_DMA_API_DEBUG
+extern void debug_dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
+
+extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+
+extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
+
+extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir);
+
+extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
+
+extern void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr);
+
+extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+
+extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction);
+
+extern void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+
+extern void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction);
+
+extern void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+
+extern void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+#else /* CONFIG_DMA_API_DEBUG */
+static inline void debug_dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_unmap_sg(struct device *dev,
+ struct scatterlist *sglist,
+ int nelems, int dir)
+{
+}
+
+static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr)
+{
+}
+
+static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_unmap_resource(struct device *dev,
+ dma_addr_t dma_addr, size_t size,
+ int direction)
+{
+}
+
+static inline void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+#endif /* CONFIG_DMA_API_DEBUG */
+#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 67f060b86a73..98b2e192fd69 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -1,23 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (C) 2018 Christoph Hellwig.
+ * Copyright (C) 2018-2020 Christoph Hellwig.
*
* DMA operations that map physical memory directly without using an IOMMU.
*/
#include <linux/memblock.h> /* for max_pfn */
#include <linux/export.h>
#include <linux/mm.h>
-#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <linux/scatterlist.h>
-#include <linux/dma-contiguous.h>
-#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
-#include <linux/swiotlb.h>
+#include <linux/slab.h>
+#include "direct.h"
/*
- * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
@@ -27,7 +26,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
{
if (force_dma_unencrypted(dev))
- return __phys_to_dma(dev, phys);
+ return phys_to_dma_unencrypted(dev, phys);
return phys_to_dma(dev, phys);
}
@@ -45,15 +44,11 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
-gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_limit)
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
{
- u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
-
- if (force_dma_unencrypted(dev))
- *phys_limit = __dma_to_phys(dev, dma_limit);
- else
- *phys_limit = dma_to_phys(dev, dma_limit);
+ u64 dma_limit = min_not_zero(
+ dev->coherent_dma_mask,
+ dev->bus_dma_limit);
/*
* Optimistically try the zone that the physical address mask falls
@@ -63,6 +58,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
* Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
* zones.
*/
+ *phys_limit = dma_to_phys(dev, dma_limit);
if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
return GFP_DMA;
if (*phys_limit <= DMA_BIT_MASK(32))
@@ -72,45 +68,55 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
- return phys_to_dma_direct(dev, phys) + size - 1 <=
- min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
+ dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
+
+ if (dma_addr == DMA_MAPPING_ERROR)
+ return false;
+ return dma_addr + size - 1 <=
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
-/*
- * Decrypting memory is allowed to block, so if this device requires
- * unencrypted memory it must come from atomic pools.
- */
-static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
- unsigned long attrs)
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
{
- if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
- return false;
- if (gfpflags_allow_blocking(gfp))
- return false;
- if (force_dma_unencrypted(dev))
- return true;
- if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
- return false;
- if (dma_alloc_need_uncached(dev, attrs))
- return true;
- return false;
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
}
-static inline bool dma_should_free_from_pool(struct device *dev,
- unsigned long attrs)
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
{
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
- return true;
- if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev))
- return false;
- if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
- return true;
- return false;
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+ size_t size)
+{
+ if (swiotlb_free(dev, page, size))
+ return;
+ dma_free_contiguous(dev, page, size);
+}
+
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
}
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- gfp_t gfp, unsigned long attrs)
+ gfp_t gfp, bool allow_highmem)
{
int node = dev_to_node(dev);
struct page *page = NULL;
@@ -118,17 +124,17 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
- if (attrs & DMA_ATTR_NO_WARN)
- gfp |= __GFP_NOWARN;
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
- /* we always manually zero the memory once we are done: */
- gfp &= ~__GFP_ZERO;
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_limit);
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, size);
- page = NULL;
+ if (page) {
+ if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
+ (!allow_highmem && PageHighMem(page))) {
+ dma_free_contiguous(dev, page, size);
+ page = NULL;
+ }
}
again:
if (!page)
@@ -153,170 +159,241 @@ again:
return page;
}
-void *dma_direct_alloc_pages(struct device *dev, size_t size,
+/*
+ * Check if a potentially blocking operations needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
+static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+ u64 phys_limit;
+ void *ret;
+
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
+ page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
+ if (!page)
+ return NULL;
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return ret;
+}
+
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
+void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ bool remap = false, set_uncached = false;
struct page *page;
void *ret;
- int err;
size = PAGE_ALIGN(size);
+ if (attrs & DMA_ATTR_NO_WARN)
+ gfp |= __GFP_NOWARN;
- if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
- ret = dma_alloc_from_pool(dev, size, &page, gfp);
- if (!ret)
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
+
+ if (!dev_is_dma_coherent(dev)) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
+
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise we require the architecture to either be able to
+ * mark arbitrary parts of the kernel direct mapping uncached,
+ * or remapped it uncached.
+ */
+ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (!set_uncached && !remap) {
+ pr_warn_once("coherent DMA allocations not supported on this platform.\n");
return NULL;
- goto done;
+ }
}
- page = __dma_direct_alloc_pages(dev, size, gfp, attrs);
+ /*
+ * Remapping or decrypting memory may block, allocate the memory from
+ * the atomic pools instead if we aren't allowed block.
+ */
+ if ((remap || force_dma_unencrypted(dev)) &&
+ dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ /* we always manually zero the memory once we are done */
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
- if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
- /* remove any dirty cache lines on the kernel alias */
- if (!PageHighMem(page))
- arch_dma_prep_coherent(page, size);
- /* return the page pointer as the opaque cookie */
- ret = page;
- goto done;
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
+ if (PageHighMem(page)) {
+ remap = true;
+ set_uncached = false;
}
- if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs)) ||
- (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
- goto out_free_pages;
- }
- memset(ret, 0, size);
- goto done;
- }
-
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
-
- ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- dma_alloc_need_uncached(dev, attrs)) {
+ if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
-done:
- if (force_dma_unencrypted(dev))
- *dma_handle = __phys_to_dma(dev, page_to_phys(page));
- else
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
+
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
- if (force_dma_unencrypted(dev)) {
- err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
- /* If memory cannot be re-encrypted, it must be leaked */
- if (err)
- return NULL;
- }
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
-void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
- dma_addr_t dma_addr, unsigned long attrs)
+void dma_direct_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
unsigned int page_order = get_order(size);
- /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
- if (dma_should_free_from_pool(dev, attrs) &&
- dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
- return;
-
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
}
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev)) {
+ arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev)) {
+ if (!dma_release_from_global_coherent(page_order, cpu_addr))
+ WARN_ON_ONCE(1);
+ return;
+ }
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+ return;
+
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
- else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
- arch_dma_clear_uncached(cpu_addr, size);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, size))
+ return;
+ }
- dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+ __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
-void *dma_direct_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
- return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
-}
+ struct page *page;
+ void *ret;
-void dma_direct_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
-{
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs))
- arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
- else
- dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ page = __dma_direct_alloc_pages(dev, size, gfp, false);
+ if (!page)
+ return NULL;
+
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_free_pages;
+ memset(ret, 0, size);
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+out_free_pages:
+ __dma_direct_free_pages(dev, page, size);
+ return NULL;
}
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
- defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+void dma_direct_free_pages(struct device *dev, size_t size,
+ struct page *page, dma_addr_t dma_addr,
+ enum dma_data_direction dir)
{
- phys_addr_t paddr = dma_to_phys(dev, addr);
+ void *vaddr = page_address(page);
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, vaddr, size))
+ return;
- if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, size, dir);
+ if (dma_set_encrypted(dev, vaddr, size))
+ return;
+ __dma_direct_free_pages(dev, page, size);
}
-EXPORT_SYMBOL(dma_direct_sync_single_for_device);
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -326,36 +403,20 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, sg->length,
- dir, SYNC_FOR_DEVICE);
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
+ swiotlb_sync_single_for_device(dev, paddr, sg->length,
+ dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, sg->length,
dir);
}
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
#endif
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
- arch_sync_dma_for_cpu_all();
- }
-
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
-}
-EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
-
void dma_direct_sync_sg_for_cpu(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -368,79 +429,72 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
- SYNC_FOR_CPU);
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
+ dir);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, sg->length);
}
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
-
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- phys_addr_t phys = dma_to_phys(dev, addr);
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-
- if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
-}
-EXPORT_SYMBOL(dma_direct_unmap_page);
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
-}
-EXPORT_SYMBOL(dma_direct_unmap_sg);
-#endif
-
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dma_addr = phys_to_dma(dev, phys);
-
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- dev_WARN_ONCE(dev, 1,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- return DMA_MAPPING_ERROR;
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_dma_is_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
}
-
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(phys, size, dir);
- return dma_addr;
}
-EXPORT_SYMBOL(dma_direct_map_page);
+#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ enum pci_p2pdma_map_type map;
struct scatterlist *sg;
+ int i, ret;
for_each_sg(sgl, sg, nents, i) {
+ if (is_pci_p2pdma_page(sg_page(sg))) {
+ map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
+ switch (map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ continue;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI
+ * host bridge must be mapped with CPU physical
+ * address and not PCI bus addresses. This is
+ * done with dma_direct_map_page() below.
+ */
+ break;
+ default:
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+ }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
+ }
sg_dma_len(sg) = sg->length;
}
@@ -448,9 +502,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return 0;
+ return ret;
}
-EXPORT_SYMBOL(dma_direct_map_sg);
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -467,7 +520,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
return dma_addr;
}
-EXPORT_SYMBOL(dma_direct_map_resource);
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -498,9 +550,13 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
+ if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret))
+ return ret;
if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
return -ENXIO;
@@ -522,20 +578,60 @@ int dma_direct_supported(struct device *dev, u64 mask)
return 1;
/*
- * This check needs to be against the actual bit mask value, so
- * use __phys_to_dma() here so that the SME encryption mask isn't
+ * This check needs to be against the actual bit mask value, so use
+ * phys_to_dma_unencrypted() here so that the SME encryption mask isn't
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
- return mask >= __phys_to_dma(dev, min_mask);
+ return mask >= phys_to_dma_unencrypted(dev, min_mask);
+}
+
+/*
+ * To check whether all ram resource ranges are covered by dma range map
+ * Returns 0 when further check is needed
+ * Returns 1 if there is some RAM range can't be covered by dma_range_map
+ */
+static int check_ram_in_range_map(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ const struct bus_dma_region *bdr = NULL;
+ const struct bus_dma_region *m;
+ struct device *dev = data;
+
+ while (start_pfn < end_pfn) {
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) {
+ bdr = m;
+ break;
+ }
+ }
+ if (!bdr)
+ return 1;
+
+ start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size);
+ }
+
+ return 0;
+}
+
+bool dma_direct_all_ram_mapped(struct device *dev)
+{
+ if (!dev->dma_range_map)
+ return true;
+ return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev,
+ check_ram_in_range_map);
}
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active() &&
- (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_active(dev) &&
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
}
@@ -543,5 +639,45 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
+ is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
+}
+
+/**
+ * dma_direct_set_offset - Assign scalar offset for a single DMA range.
+ * @dev: device pointer; needed to "own" the alloced memory.
+ * @cpu_start: beginning of memory region covered by this offset.
+ * @dma_start: beginning of DMA/PCI region covered by this offset.
+ * @size: size of the region.
+ *
+ * This is for the simple case of a uniform offset which cannot
+ * be discovered by "dma-ranges".
+ *
+ * It returns -ENOMEM if out of memory, -EINVAL if a map
+ * already exists, 0 otherwise.
+ *
+ * Note: any call to this from a driver is a bug. The mapping needs
+ * to be described by the device tree or other firmware interfaces.
+ */
+int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
+ dma_addr_t dma_start, u64 size)
+{
+ struct bus_dma_region *map;
+ u64 offset = (u64)cpu_start - (u64)dma_start;
+
+ if (dev->dma_range_map) {
+ dev_err(dev, "attempt to add DMA range to existing map\n");
+ return -EINVAL;
+ }
+
+ if (!offset)
+ return 0;
+
+ map = kcalloc(2, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map[0].cpu_start = cpu_start;
+ map[0].dma_start = dma_start;
+ map[0].size = size;
+ dev->dma_range_map = map;
+ return 0;
}
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
new file mode 100644
index 000000000000..18d346118fe8
--- /dev/null
+++ b/kernel/dma/direct.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without using an IOMMU.
+ */
+#ifndef _KERNEL_DMA_DIRECT_H
+#define _KERNEL_DMA_DIRECT_H
+
+#include <linux/dma-direct.h>
+#include <linux/memremap.h>
+
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_can_mmap(struct device *dev);
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs);
+bool dma_direct_all_ram_mapped(struct device *dev);
+size_t dma_direct_max_mapping_size(struct device *dev);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_sg(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(paddr, size, dir);
+}
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_all();
+ }
+
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, size);
+}
+
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+ struct page *page, unsigned long offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dma_addr = phys_to_dma(dev, phys);
+
+ if (is_swiotlb_force_bounce(dev)) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
+ return swiotlb_map(dev, phys, size, dir, attrs);
+ }
+
+ if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
+ if (is_swiotlb_active(dev))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ dev_WARN_ONCE(dev, 1,
+ "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(phys, size, dir);
+ return dma_addr;
+}
+
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys = dma_to_phys(dev, addr);
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+ if (unlikely(is_swiotlb_buffer(dev, phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index 05607642c888..b492d59ac77e 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -2,7 +2,7 @@
/*
* Dummy DMA ops that always fail.
*/
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
{
- return 0;
+ return -EINVAL;
}
static int dma_dummy_supported(struct device *hwdev, u64 mask)
@@ -36,4 +36,3 @@ const struct dma_map_ops dma_dummy_ops = {
.map_sg = dma_dummy_map_sg,
.dma_supported = dma_dummy_supported,
};
-EXPORT_SYMBOL(dma_dummy_ops);
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index 000000000000..02205ab53b7e
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 HiSilicon Limited.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/map_benchmark.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+struct map_benchmark_data {
+ struct map_benchmark bparam;
+ struct device *dev;
+ struct dentry *debugfs;
+ enum dma_data_direction dir;
+ atomic64_t sum_map_100ns;
+ atomic64_t sum_unmap_100ns;
+ atomic64_t sum_sq_map;
+ atomic64_t sum_sq_unmap;
+ atomic64_t loops;
+};
+
+static int map_benchmark_thread(void *data)
+{
+ void *buf;
+ dma_addr_t dma_addr;
+ struct map_benchmark_data *map = data;
+ int npages = map->bparam.granule;
+ u64 size = npages * PAGE_SIZE;
+ int ret = 0;
+
+ buf = alloc_pages_exact(size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ while (!kthread_should_stop()) {
+ u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
+ ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+ ktime_t map_delta, unmap_delta;
+
+ /*
+ * for a non-coherent device, if we don't stain them in the
+ * cache, this will give an underestimate of the real-world
+ * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
+ * 66 means evertything goes well! 66 is lucky.
+ */
+ if (map->dir != DMA_FROM_DEVICE)
+ memset(buf, 0x66, size);
+
+ map_stime = ktime_get();
+ dma_addr = dma_map_single(map->dev, buf, size, map->dir);
+ if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
+ pr_err("dma_map_single failed on %s\n",
+ dev_name(map->dev));
+ ret = -ENOMEM;
+ goto out;
+ }
+ map_etime = ktime_get();
+ map_delta = ktime_sub(map_etime, map_stime);
+
+ /* Pretend DMA is transmitting */
+ ndelay(map->bparam.dma_trans_ns);
+
+ unmap_stime = ktime_get();
+ dma_unmap_single(map->dev, dma_addr, size, map->dir);
+ unmap_etime = ktime_get();
+ unmap_delta = ktime_sub(unmap_etime, unmap_stime);
+
+ /* calculate sum and sum of squares */
+
+ map_100ns = div64_ul(map_delta, 100);
+ unmap_100ns = div64_ul(unmap_delta, 100);
+ map_sq = map_100ns * map_100ns;
+ unmap_sq = unmap_100ns * unmap_100ns;
+
+ atomic64_add(map_100ns, &map->sum_map_100ns);
+ atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
+ atomic64_add(map_sq, &map->sum_sq_map);
+ atomic64_add(unmap_sq, &map->sum_sq_unmap);
+ atomic64_inc(&map->loops);
+ }
+
+out:
+ free_pages_exact(buf, size);
+ return ret;
+}
+
+static int do_map_benchmark(struct map_benchmark_data *map)
+{
+ struct task_struct **tsk;
+ int threads = map->bparam.threads;
+ int node = map->bparam.node;
+ const cpumask_t *cpu_mask = cpumask_of_node(node);
+ u64 loops;
+ int ret = 0;
+ int i;
+
+ tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
+ if (!tsk)
+ return -ENOMEM;
+
+ get_device(map->dev);
+
+ for (i = 0; i < threads; i++) {
+ tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
+ map->bparam.node, "dma-map-benchmark/%d", i);
+ if (IS_ERR(tsk[i])) {
+ pr_err("create dma_map thread failed\n");
+ ret = PTR_ERR(tsk[i]);
+ goto out;
+ }
+
+ if (node != NUMA_NO_NODE)
+ kthread_bind_mask(tsk[i], cpu_mask);
+ }
+
+ /* clear the old value in the previous benchmark */
+ atomic64_set(&map->sum_map_100ns, 0);
+ atomic64_set(&map->sum_unmap_100ns, 0);
+ atomic64_set(&map->sum_sq_map, 0);
+ atomic64_set(&map->sum_sq_unmap, 0);
+ atomic64_set(&map->loops, 0);
+
+ for (i = 0; i < threads; i++) {
+ get_task_struct(tsk[i]);
+ wake_up_process(tsk[i]);
+ }
+
+ msleep_interruptible(map->bparam.seconds * 1000);
+
+ /* wait for the completion of benchmark threads */
+ for (i = 0; i < threads; i++) {
+ ret = kthread_stop(tsk[i]);
+ if (ret)
+ goto out;
+ }
+
+ loops = atomic64_read(&map->loops);
+ if (likely(loops > 0)) {
+ u64 map_variance, unmap_variance;
+ u64 sum_map = atomic64_read(&map->sum_map_100ns);
+ u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
+ u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
+ u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
+
+ /* average latency */
+ map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
+ map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
+
+ /* standard deviation of latency */
+ map_variance = div64_u64(sum_sq_map, loops) -
+ map->bparam.avg_map_100ns *
+ map->bparam.avg_map_100ns;
+ unmap_variance = div64_u64(sum_sq_unmap, loops) -
+ map->bparam.avg_unmap_100ns *
+ map->bparam.avg_unmap_100ns;
+ map->bparam.map_stddev = int_sqrt64(map_variance);
+ map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
+ }
+
+out:
+ for (i = 0; i < threads; i++)
+ put_task_struct(tsk[i]);
+ put_device(map->dev);
+ kfree(tsk);
+ return ret;
+}
+
+static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct map_benchmark_data *map = file->private_data;
+ void __user *argp = (void __user *)arg;
+ u64 old_dma_mask;
+ int ret;
+
+ if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case DMA_MAP_BENCHMARK:
+ if (map->bparam.threads == 0 ||
+ map->bparam.threads > DMA_MAP_MAX_THREADS) {
+ pr_err("invalid thread number\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.seconds == 0 ||
+ map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
+ pr_err("invalid duration seconds\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+ pr_err("invalid transmission delay\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.node != NUMA_NO_NODE &&
+ !node_possible(map->bparam.node)) {
+ pr_err("invalid numa node\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
+ pr_err("invalid granule size\n");
+ return -EINVAL;
+ }
+
+ switch (map->bparam.dma_dir) {
+ case DMA_MAP_BIDIRECTIONAL:
+ map->dir = DMA_BIDIRECTIONAL;
+ break;
+ case DMA_MAP_FROM_DEVICE:
+ map->dir = DMA_FROM_DEVICE;
+ break;
+ case DMA_MAP_TO_DEVICE:
+ map->dir = DMA_TO_DEVICE;
+ break;
+ default:
+ pr_err("invalid DMA direction\n");
+ return -EINVAL;
+ }
+
+ old_dma_mask = dma_get_mask(map->dev);
+
+ ret = dma_set_mask(map->dev,
+ DMA_BIT_MASK(map->bparam.dma_bits));
+ if (ret) {
+ pr_err("failed to set dma_mask on device %s\n",
+ dev_name(map->dev));
+ return -EINVAL;
+ }
+
+ ret = do_map_benchmark(map);
+
+ /*
+ * restore the original dma_mask as many devices' dma_mask are
+ * set by architectures, acpi, busses. When we bind them back
+ * to their original drivers, those drivers shouldn't see
+ * dma_mask changed by benchmark
+ */
+ dma_set_mask(map->dev, old_dma_mask);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static const struct file_operations map_benchmark_fops = {
+ .open = simple_open,
+ .unlocked_ioctl = map_benchmark_ioctl,
+};
+
+static void map_benchmark_remove_debugfs(void *data)
+{
+ struct map_benchmark_data *map = (struct map_benchmark_data *)data;
+
+ debugfs_remove(map->debugfs);
+}
+
+static int __map_benchmark_probe(struct device *dev)
+{
+ struct dentry *entry;
+ struct map_benchmark_data *map;
+ int ret;
+
+ map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map->dev = dev;
+
+ ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
+ if (ret) {
+ pr_err("Can't add debugfs remove action\n");
+ return ret;
+ }
+
+ /*
+ * we only permit a device bound with this driver, 2nd probe
+ * will fail
+ */
+ entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
+ &map_benchmark_fops);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+ map->debugfs = entry;
+
+ return 0;
+}
+
+static int map_benchmark_platform_probe(struct platform_device *pdev)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct platform_driver map_benchmark_platform_driver = {
+ .driver = {
+ .name = "dma_map_benchmark",
+ },
+ .probe = map_benchmark_platform_probe,
+};
+
+static int
+map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct pci_driver map_benchmark_pci_driver = {
+ .name = "dma_map_benchmark",
+ .probe = map_benchmark_pci_probe,
+};
+
+static int __init map_benchmark_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&map_benchmark_pci_driver);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&map_benchmark_platform_driver);
+ if (ret) {
+ pci_unregister_driver(&map_benchmark_pci_driver);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit map_benchmark_cleanup(void)
+{
+ platform_driver_unregister(&map_benchmark_platform_driver);
+ pci_unregister_driver(&map_benchmark_pci_driver);
+}
+
+module_init(map_benchmark_init);
+module_exit(map_benchmark_cleanup);
+
+MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
+MODULE_DESCRIPTION("dma_map benchmark driver");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a8c18c9a796f..58db8fd70471 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -7,13 +7,21 @@
*/
#include <linux/memblock.h> /* for max_pfn */
#include <linux/acpi.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include "debug.h"
+#include "direct.h"
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT);
+#endif
/*
* Managed DMA API
@@ -105,21 +113,277 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
EXPORT_SYMBOL(dmam_alloc_attrs);
+static bool dma_go_direct(struct device *dev, dma_addr_t mask,
+ const struct dma_map_ops *ops)
+{
+ if (likely(!ops))
+ return true;
+#ifdef CONFIG_DMA_OPS_BYPASS
+ if (dev->dma_ops_bypass)
+ return min_not_zero(mask, dev->bus_dma_limit) >=
+ dma_direct_get_required_mask(dev);
+#endif
+ return false;
+}
+
+
/*
- * Create scatter-list for the already allocated DMA buffer.
+ * Check if the devices uses a direct mapping for streaming DMA operations.
+ * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
+ * enough.
*/
-int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
+static inline bool dma_alloc_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, dev->coherent_dma_mask, ops);
+}
+
+static inline bool dma_map_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, *dev->dma_mask, ops);
+}
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ else
+ addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ kmsan_handle_dma(page, offset, size, dir);
+ debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
+
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_page_attrs);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_page_direct(dev, addr + size))
+ dma_direct_unmap_page(dev, addr, size, dir, attrs);
+ else if (ops->unmap_page)
+ ops->unmap_page(dev, addr, size, dir, attrs);
+ debug_dma_unmap_page(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+
+static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ int ents;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return 0;
+
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_sg_direct(dev, sg, nents))
+ ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else
+ ents = ops->map_sg(dev, sg, nents, dir, attrs);
+
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
+ return -EIO;
+ }
+
+ return ents;
+}
+
+/**
+ * dma_map_sg_attrs - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist passed in the sg argument with
+ * nents segments for the @dir DMA operation by the @dev device.
+ *
+ * Returns the number of mapped entries (which can be less than nents)
+ * on success. Zero is returned for any error.
+ *
+ * dma_unmap_sg_attrs() should be used to unmap the buffer with the
+ * original sg and original nents (not the value returned by this funciton).
+ */
+unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
int ret;
- ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
- if (!ret)
- sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);
+ if (ret < 0)
+ return 0;
return ret;
}
+EXPORT_SYMBOL(dma_map_sg_attrs);
+
+/**
+ * dma_map_sgtable - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sgt: The sg_table object describing the buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist stored in the given sg_table
+ * object for the @dir DMA operation by the @dev device. After success, the
+ * ownership for the buffer is transferred to the DMA domain. One has to
+ * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
+ * ownership of the buffer back to the CPU domain before touching the
+ * buffer by the CPU.
+ *
+ * Returns 0 on success or a negative error code on error. The following
+ * error codes are supported with the given meaning:
+ *
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
+ */
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int nents;
+
+ nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
+ if (nents < 0)
+ return nents;
+ sgt->nents = nents;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dma_map_sgtable);
+
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ debug_dma_unmap_sg(dev, sg, nents, dir);
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_sg_direct(dev, sg, nents))
+ dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (ops->unmap_sg)
+ ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_sg_attrs);
+
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr = DMA_MAPPING_ERROR;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_map_direct(dev, ops))
+ addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+ else if (ops->map_resource)
+ addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
+
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_resource);
+
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (!dma_map_direct(dev, ops) && ops->unmap_resource)
+ ops->unmap_resource(dev, addr, size, dir, attrs);
+ debug_dma_unmap_resource(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_resource);
+
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(dev, addr, size, dir);
+ debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (ops->sync_single_for_device)
+ ops->sync_single_for_device(dev, addr, size, dir);
+ debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
+
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
/*
* The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -138,7 +402,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
if (!ops->get_sgtable)
@@ -154,11 +418,7 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
- if (dev_is_dma_coherent(dev) ||
- (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
- (attrs & DMA_ATTR_NON_CONSISTENT)))
+ if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
if (attrs & DMA_ATTR_WRITE_COMBINE)
@@ -168,35 +428,6 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
}
#endif /* CONFIG_MMU */
-/*
- * Create userspace mapping for the DMA-coherent memory.
- */
-int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
-{
-#ifdef CONFIG_MMU
- unsigned long user_count = vma_pages(vma);
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long off = vma->vm_pgoff;
- int ret = -ENXIO;
-
- vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
-
- if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
- return ret;
-
- if (off >= count || user_count > count - off)
- return -ENXIO;
-
- return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
- user_count << PAGE_SHIFT, vma->vm_page_prot);
-#else
- return -ENXIO;
-#endif /* CONFIG_MMU */
-}
-
/**
* dma_can_mmap - check if a given device supports dma_mmap_*
* @dev: device to check
@@ -208,7 +439,7 @@ bool dma_can_mmap(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_can_mmap(dev);
return ops->mmap != NULL;
}
@@ -233,7 +464,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
if (!ops->mmap)
@@ -246,7 +477,7 @@ u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
@@ -271,20 +502,28 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
else
return NULL;
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
@@ -309,30 +548,217 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
return;
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
else if (ops->free)
ops->free(dev, size, cpu_addr, dma_handle, attrs);
}
EXPORT_SYMBOL(dma_free_attrs);
-int dma_supported(struct device *dev, u64 mask)
+static struct page *__dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (WARN_ON_ONCE(!dev->coherent_dma_mask))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
+
+ size = PAGE_ALIGN(size);
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (!ops->alloc_pages)
+ return NULL;
+ return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+}
+
+struct page *dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+
+ if (page)
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ return page;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_pages);
+
+static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ size = PAGE_ALIGN(size);
+ if (dma_alloc_direct(dev, ops))
+ dma_direct_free_pages(dev, size, page, dma_handle, dir);
+ else if (ops->free_pages)
+ ops->free_pages(dev, size, page, dma_handle, dir);
+}
+
+void dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ debug_dma_unmap_page(dev, dma_handle, size, dir);
+ __dma_free_pages(dev, size, page, dma_handle, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_pages);
+
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct page *page)
+{
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(page) + vma->vm_pgoff,
+ vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(dma_mmap_pages);
+
+static struct sg_table *alloc_single_sgt(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp)
+{
+ struct sg_table *sgt;
+ struct page *page;
+
+ sgt = kmalloc(sizeof(*sgt), gfp);
+ if (!sgt)
+ return NULL;
+ if (sg_alloc_table(sgt, 1, gfp))
+ goto out_free_sgt;
+ page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp);
+ if (!page)
+ goto out_free_table;
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ sg_dma_len(sgt->sgl) = sgt->sgl->length;
+ return sgt;
+out_free_table:
+ sg_free_table(sgt);
+out_free_sgt:
+ kfree(sgt);
+ return NULL;
+}
+
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct sg_table *sgt;
+
+ if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
+
+ if (ops && ops->alloc_noncontiguous)
+ sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+ else
+ sgt = alloc_single_sgt(dev, size, dir, gfp);
+
+ if (sgt) {
+ sgt->nents = 1;
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ }
+ return sgt;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+static void free_single_sgt(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address,
+ dir);
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
+ if (ops && ops->free_noncontiguous)
+ ops->free_noncontiguous(dev, size, sgt, dir);
+ else
+ free_single_sgt(dev, size, sgt, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (ops && ops->alloc_noncontiguous)
+ return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+ return page_address(sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
+
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops && ops->alloc_noncontiguous)
+ vunmap(vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
+
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct sg_table *sgt)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (ops && ops->alloc_noncontiguous) {
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count ||
+ vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+ }
+ return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
+static int dma_supported(struct device *dev, u64 mask)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /*
+ * ->dma_supported sets the bypass flag, so we must always call
+ * into the method here unless the device is truly direct mapped.
+ */
+ if (!ops)
return dma_direct_supported(dev, mask);
if (!ops->dma_supported)
return 1;
return ops->dma_supported(dev, mask);
}
-EXPORT_SYMBOL(dma_supported);
-#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
-void arch_dma_set_mask(struct device *dev, u64 mask);
-#else
-#define arch_dma_set_mask(dev, mask) do { } while (0)
-#endif
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /* if ops is not set, dma direct will be used which supports P2PDMA */
+ if (!ops)
+ return true;
+
+ /*
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
+ */
+
+ return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
int dma_set_mask(struct device *dev, u64 mask)
{
@@ -351,7 +777,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -367,28 +792,35 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
-void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
- enum dma_data_direction dir)
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev: device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false. Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- BUG_ON(!valid_dma_direction(dir));
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
- if (dma_is_direct(ops))
- arch_dma_cache_sync(dev, vaddr, size, dir);
- else if (ops->cache_sync)
- ops->cache_sync(dev, vaddr, size, dir);
+ if (unlikely(ops))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
}
-EXPORT_SYMBOL(dma_cache_sync);
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
size_t dma_max_mapping_size(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
size_t size = SIZE_MAX;
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
size = dma_direct_max_mapping_size(dev);
else if (ops && ops->max_mapping_size)
size = ops->max_mapping_size(dev);
@@ -397,11 +829,23 @@ size_t dma_max_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+size_t dma_opt_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
return dma_direct_need_sync(dev, dma_addr);
return ops->sync_single_for_cpu || ops->sync_single_for_device;
}
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
new file mode 100644
index 000000000000..af4a6ef48ce0
--- /dev/null
+++ b/kernel/dma/ops_helpers.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for DMA ops implementations. These generally rely on the fact that
+ * the allocated memory contains normal pages in the direct kernel mapping.
+ */
+#include <linux/dma-map-ops.h>
+
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
+/*
+ * Create scatter-list for the already allocated DMA buffer.
+ */
+int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ */
+int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+#ifdef CONFIG_MMU
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (off >= count || user_count > count - off)
+ return -ENXIO;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(page) + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+#else
+ return -ENXIO;
+#endif /* CONFIG_MMU */
+}
+
+struct page *dma_common_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct page *page;
+
+ page = dma_alloc_contiguous(dev, size, gfp);
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size));
+ if (!page)
+ return NULL;
+
+ *dma_handle = ops->map_page(dev, page, 0, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ if (*dma_handle == DMA_MAPPING_ERROR) {
+ dma_free_contiguous(dev, page, size);
+ return NULL;
+ }
+
+ memset(page_address(page), 0, size);
+ return page;
+}
+
+void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops->unmap_page)
+ ops->unmap_page(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ dma_free_contiguous(dev, page, size);
+}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 6bc74a2d5127..d10613eb0f63 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -3,9 +3,10 @@
* Copyright (C) 2012 ARM Ltd.
* Copyright (C) 2020 Google LLC
*/
+#include <linux/cma.h>
#include <linux/debugfs.h>
+#include <linux/dma-map-ops.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
#include <linux/init.h>
#include <linux/genalloc.h>
#include <linux/set_memory.h>
@@ -37,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
struct dentry *root;
root = debugfs_create_dir("dma_pools", NULL);
- if (IS_ERR_OR_NULL(root))
- return;
-
debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
@@ -55,20 +53,47 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
pool_size_kernel += size;
}
+static bool cma_in_zone(gfp_t gfp)
+{
+ unsigned long size;
+ phys_addr_t end;
+ struct cma *cma;
+
+ cma = dev_get_cma_area(NULL);
+ if (!cma)
+ return false;
+
+ size = cma_get_size(cma);
+ if (!size)
+ return false;
+
+ /* CMA can't cross zone boundaries, see cma_activate_area() */
+ end = cma_get_base(cma) + size - 1;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ return end <= DMA_BIT_MASK(zone_dma_bits);
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return end <= DMA_BIT_MASK(32);
+ return true;
+}
+
static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
gfp_t gfp)
{
unsigned int order;
- struct page *page;
+ struct page *page = NULL;
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER-1 */
- order = min(get_order(pool_size), MAX_ORDER-1);
+ /* Cannot allocate larger than MAX_PAGE_ORDER */
+ order = min(get_order(pool_size), MAX_PAGE_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
- page = alloc_pages(gfp, order);
+ if (cma_in_zone(gfp))
+ page = dma_alloc_from_contiguous(NULL, 1 << order,
+ order, false);
+ if (!page)
+ page = alloc_pages(gfp, order);
} while (!page && order-- > 0);
if (!page)
goto out;
@@ -86,7 +111,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#endif
/*
* Memory in the atomic DMA pools must be unencrypted, the pools do not
- * shrink so no re-encryption occurs in dma_direct_free_pages().
+ * shrink so no re-encryption occurs in dma_direct_free().
*/
ret = set_memory_decrypted((unsigned long)page_to_virt(page),
1 << order);
@@ -110,9 +135,9 @@ encrypt_mapping:
remove_mapping:
#ifdef CONFIG_DMA_DIRECT_REMAP
dma_common_free_remap(addr, pool_size);
-#endif
-free_page: __maybe_unused
+free_page:
__free_pages(page, order);
+#endif
out:
return ret;
}
@@ -165,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
@@ -178,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -196,93 +221,75 @@ static int __init dma_atomic_pool_init(void)
}
postcore_initcall(dma_atomic_pool_init);
-static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev)
+static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
{
- u64 phys_mask;
- gfp_t gfp;
-
- gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
- if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA)
+ if (prev == NULL) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return atomic_pool_dma32;
+ if (atomic_pool_dma && (gfp & GFP_DMA))
+ return atomic_pool_dma;
+ return atomic_pool_kernel;
+ }
+ if (prev == atomic_pool_kernel)
+ return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
+ if (prev == atomic_pool_dma32)
return atomic_pool_dma;
- if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32)
- return atomic_pool_dma32;
- return atomic_pool_kernel;
+ return NULL;
}
-static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool)
+static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
+ struct gen_pool *pool, void **cpu_addr,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
{
- if (bad_pool == atomic_pool_kernel)
- return atomic_pool_dma32 ? : atomic_pool_dma;
+ unsigned long addr;
+ phys_addr_t phys;
- if (bad_pool == atomic_pool_dma32)
- return atomic_pool_dma;
+ addr = gen_pool_alloc(pool, size);
+ if (!addr)
+ return NULL;
- return NULL;
-}
+ phys = gen_pool_virt_to_phys(pool, addr);
+ if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) {
+ gen_pool_free(pool, addr, size);
+ return NULL;
+ }
-static inline struct gen_pool *dma_guess_pool(struct device *dev,
- struct gen_pool *bad_pool)
-{
- if (bad_pool)
- return dma_get_safer_pool(bad_pool);
+ if (gen_pool_avail(pool) < atomic_pool_size)
+ schedule_work(&atomic_pool_work);
- return dma_guess_pool_from_device(dev);
+ *cpu_addr = (void *)addr;
+ memset(*cpu_addr, 0, size);
+ return pfn_to_page(__phys_to_pfn(phys));
}
-void *dma_alloc_from_pool(struct device *dev, size_t size,
- struct page **ret_page, gfp_t flags)
+struct page *dma_alloc_from_pool(struct device *dev, size_t size,
+ void **cpu_addr, gfp_t gfp,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
{
struct gen_pool *pool = NULL;
- unsigned long val = 0;
- void *ptr = NULL;
- phys_addr_t phys;
-
- while (1) {
- pool = dma_guess_pool(dev, pool);
- if (!pool) {
- WARN(1, "Failed to get suitable pool for %s\n",
- dev_name(dev));
- break;
- }
-
- val = gen_pool_alloc(pool, size);
- if (!val)
- continue;
-
- phys = gen_pool_virt_to_phys(pool, val);
- if (dma_coherent_ok(dev, phys, size))
- break;
-
- gen_pool_free(pool, val, size);
- val = 0;
- }
-
-
- if (val) {
- *ret_page = pfn_to_page(__phys_to_pfn(phys));
- ptr = (void *)val;
- memset(ptr, 0, size);
+ struct page *page;
- if (gen_pool_avail(pool) < atomic_pool_size)
- schedule_work(&atomic_pool_work);
+ while ((pool = dma_guess_pool(pool, gfp))) {
+ page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
+ phys_addr_ok);
+ if (page)
+ return page;
}
- return ptr;
+ WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
+ return NULL;
}
bool dma_free_from_pool(struct device *dev, void *start, size_t size)
{
struct gen_pool *pool = NULL;
- while (1) {
- pool = dma_guess_pool(dev, pool);
- if (!pool)
- return false;
-
- if (gen_pool_has_addr(pool, (unsigned long)start, size)) {
- gen_pool_free(pool, (unsigned long)start, size);
- return true;
- }
+ while ((pool = dma_guess_pool(pool, 0))) {
+ if (!gen_pool_has_addr(pool, (unsigned long)start, size))
+ continue;
+ gen_pool_free(pool, (unsigned long)start, size);
+ return true;
}
+
+ return false;
}
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 78b23f089cf1..27596f3b4aef 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2014 The Linux Foundation
*/
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -43,13 +43,13 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
void *vaddr;
int i;
- pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
+ pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
- kfree(pages);
+ kvfree(pages);
return vaddr;
}
@@ -66,6 +66,5 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
return;
}
- unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
vunmap(cpu_addr);
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c19379fabd20..b079a9a8e087 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,38 +21,37 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/mm.h>
+#include <linux/dma-map-ops.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/rculist.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/mem_encrypt.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
-#define OFFSET(val,align) ((unsigned long) \
- ( (val) & ( (align) - 1)))
-
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
/*
@@ -62,120 +61,189 @@
*/
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-enum swiotlb_force swiotlb_force;
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-/*
- * Used to do a quick range check in swiotlb_tbl_unmap_single and
- * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
- * API.
+/**
+ * struct io_tlb_slot - IO TLB slot descriptor
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size: Size of the allocated buffer.
+ * @list: The free list describing the number of free entries available
+ * from each index.
*/
-phys_addr_t io_tlb_start, io_tlb_end;
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned int list;
+};
-/*
- * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
- */
-static unsigned long io_tlb_nslabs;
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
-/*
- * The number of used IO TLB block
- */
-static unsigned long io_tlb_used;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
-/*
- * This is a free list describing the number of free entries available from
- * each index
- */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
+static void swiotlb_dyn_alloc(struct work_struct *work);
-/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
+static struct io_tlb_mem io_tlb_default_mem = {
+ .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock),
+ .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools),
+ .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc,
+ swiotlb_dyn_alloc),
+};
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static struct io_tlb_mem io_tlb_default_mem;
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
*/
-unsigned int max_segment;
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
/*
- * We need to save away the original address corresponding to a mapped entry
- * for the sync operations.
+ * Round up number of slabs to the next power of 2. The last area is going
+ * be smaller than the rest if default_nslabs is not power of two.
+ * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
+ * otherwise a segment may span two or more areas. It conflicts with free
+ * contiguous slots tracking: free slots are treated contiguous no matter
+ * whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
*/
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-static phys_addr_t *io_tlb_orig_addr;
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
-/*
- * Protect the above data structures in the map and unmap calls
+/**
+ * swiotlb_adjust_nareas() - adjust the number of areas and slots
+ * @nareas: Desired number of areas. Zero is treated as 1.
+ *
+ * Adjust the default number of areas in a memory pool.
+ * The default size of the memory pool may also change to meet minimum area
+ * size requirements.
*/
-static DEFINE_SPINLOCK(io_tlb_lock);
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
-static int late_alloc;
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
+
+/**
+ * limit_nareas() - get the maximum number of areas for a given memory pool size
+ * @nareas: Desired number of areas.
+ * @nslots: Total number of slots in the memory pool.
+ *
+ * Limit the number of areas to the maximum possible number of areas in
+ * a memory pool of the given size.
+ *
+ * Return: Maximum possible number of areas.
+ */
+static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots)
+{
+ if (nslots < nareas * IO_TLB_SEGSIZE)
+ return nslots / IO_TLB_SEGSIZE;
+ return nareas;
+}
static int __init
setup_io_tlb_npages(char *str)
{
if (isdigit(*str)) {
- io_tlb_nslabs = simple_strtoul(str, &str, 0);
/* avoid tail segment of size < IO_TLB_SEGSIZE */
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ default_nslabs =
+ ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
}
if (*str == ',')
++str;
- if (!strcmp(str, "force")) {
- swiotlb_force = SWIOTLB_FORCE;
- } else if (!strcmp(str, "noforce")) {
- swiotlb_force = SWIOTLB_NO_FORCE;
- io_tlb_nslabs = 1;
- }
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
+ if (!strcmp(str, "force"))
+ swiotlb_force_bounce = true;
+ else if (!strcmp(str, "noforce"))
+ swiotlb_force_disable = true;
return 0;
}
early_param("swiotlb", setup_io_tlb_npages);
-static bool no_iotlb_memory;
-
-unsigned long swiotlb_nr_tbl(void)
-{
- return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
-}
-EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
-
-unsigned int swiotlb_max_segment(void)
-{
- return unlikely(no_iotlb_memory) ? 0 : max_segment;
-}
-EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-
-void swiotlb_set_max_segment(unsigned int val)
+unsigned long swiotlb_size_or_default(void)
{
- if (swiotlb_force == SWIOTLB_FORCE)
- max_segment = 1;
- else
- max_segment = rounddown(val, PAGE_SIZE);
+ return default_nslabs << IO_TLB_SHIFT;
}
-/* default to 64MB */
-#define IO_TLB_DEFAULT_SIZE (64UL<<20)
-unsigned long swiotlb_size_or_default(void)
+void __init swiotlb_adjust_size(unsigned long size)
{
- unsigned long size;
-
- size = io_tlb_nslabs << IO_TLB_SHIFT;
+ /*
+ * If swiotlb parameter has not been specified, give a chance to
+ * architectures such as those supporting memory encryption to
+ * adjust/expand SWIOTLB size for their use.
+ */
+ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
+ return;
- return size ? size : (IO_TLB_DEFAULT_SIZE);
+ size = ALIGN(size, IO_TLB_SIZE);
+ default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
+ pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
void swiotlb_print_info(void)
{
- unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
- if (no_iotlb_memory) {
+ if (!mem->nslabs) {
pr_warn("No low mem\n");
return;
}
- pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n",
- (unsigned long long)io_tlb_start,
- (unsigned long long)io_tlb_end,
- bytes >> 20);
+ pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
+ (mem->nslabs << IO_TLB_SHIFT) >> 20);
+}
+
+static inline unsigned long io_tlb_offset(unsigned long val)
+{
+ return val & (IO_TLB_SEGSIZE - 1);
+}
+
+static inline unsigned long nr_slots(u64 val)
+{
+ return DIV_ROUND_UP(val, IO_TLB_SIZE);
}
/*
@@ -186,87 +254,168 @@ void swiotlb_print_info(void)
*/
void __init swiotlb_update_mem_attributes(void)
{
- void *vaddr;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long bytes;
- if (no_iotlb_memory || late_alloc)
+ if (!mem->nslabs || mem->late_alloc)
return;
+ bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
+}
+
+static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc, unsigned int nareas)
+{
+ void *vaddr = phys_to_virt(start);
+ unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+
+ mem->nslabs = nslabs;
+ mem->start = start;
+ mem->end = mem->start + bytes;
+ mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
+
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
+
+ for (i = 0; i < mem->nslabs; i++) {
+ mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i),
+ mem->nslabs - i);
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
+ }
- vaddr = phys_to_virt(io_tlb_start);
- bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
memset(vaddr, 0, bytes);
+ mem->vaddr = vaddr;
+ return;
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+/**
+ * add_mem_pool() - add a memory pool to the allocator
+ * @mem: Software IO TLB allocator.
+ * @pool: Memory pool to be added.
+ */
+static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool)
{
- unsigned long i, bytes;
- size_t alloc_size;
-
- bytes = nslabs << IO_TLB_SHIFT;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock(&mem->lock);
+ list_add_rcu(&pool->node, &mem->pools);
+ mem->nslabs += pool->nslabs;
+ spin_unlock(&mem->lock);
+#else
+ mem->nslabs = pool->nslabs;
+#endif
+}
- io_tlb_nslabs = nslabs;
- io_tlb_start = __pa(tlb);
- io_tlb_end = io_tlb_start + bytes;
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
/*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
*/
- alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
- io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!io_tlb_list)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
-
- alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
- io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!io_tlb_orig_addr)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
}
- io_tlb_index = 0;
- if (verbose)
- swiotlb_print_info();
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
- swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
- return 0;
+ return tlb;
}
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
*/
-void __init
-swiotlb_init(int verbose)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- size_t default_size = IO_TLB_DEFAULT_SIZE;
- unsigned char *vstart;
- unsigned long bytes;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs;
+ unsigned int nareas;
+ size_t alloc_size;
+ void *tlb;
+
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
+ return;
+
+ io_tlb_default_mem.force_bounce =
+ swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (flags & SWIOTLB_ANY)
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+ else
+ io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT;
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ nslabs = default_nslabs;
+ nareas = limit_nareas(default_nareas, nslabs);
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
}
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
+ }
- /* Get IO TLB memory from the low pages */
- vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
- if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
return;
+ }
- if (io_tlb_start)
- memblock_free_early(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
- pr_warn("Cannot allocate buffer");
- no_iotlb_memory = true;
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ nareas), SMP_CACHE_BYTES);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
+
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
+
+ if (flags & SWIOTLB_VERBOSE)
+ swiotlb_print_info();
+}
+
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+ swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -274,148 +423,459 @@ swiotlb_init(int verbose)
* initialize the swiotlb later using the slab allocator if needed.
* This should be just like above, but with some error catching.
*/
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- unsigned long bytes, req_nslabs = io_tlb_nslabs;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ unsigned int nareas;
unsigned char *vstart = NULL;
- unsigned int order;
+ unsigned int order, area_order;
+ bool retried = false;
int rc = 0;
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
+ if (io_tlb_default_mem.nslabs)
+ return 0;
- /*
- * Get IO TLB memory from the low pages
- */
- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ if (swiotlb_force_disable)
+ return 0;
+
+ io_tlb_default_mem.force_bounce = swiotlb_force_bounce;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+ else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+ else
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+retry:
+ order = get_order(nslabs << IO_TLB_SHIFT);
+ nslabs = SLABS_PER_PAGE << order;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
order);
if (vstart)
break;
order--;
+ nslabs = SLABS_PER_PAGE << order;
+ retried = true;
}
- if (!vstart) {
- io_tlb_nslabs = req_nslabs;
+ if (!vstart)
return -ENOMEM;
+
+ if (remap)
+ rc = remap(vstart, nslabs);
+ if (rc) {
+ free_pages((unsigned long)vstart, order);
+
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ return rc;
+ retried = true;
+ goto retry;
}
- if (order != get_order(bytes)) {
+
+ if (retried) {
pr_warn("only able to allocate %ld MB\n",
(PAGE_SIZE << order) >> 20);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
}
- rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
- if (rc)
- free_pages((unsigned long)vstart, order);
- return rc;
+ nareas = limit_nareas(default_nareas, nslabs);
+ area_order = get_order(array_size(sizeof(*mem->areas), nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
+
+ mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*mem->slots), nslabs)));
+ if (!mem->slots)
+ goto error_slots;
+
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
+ nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
+
+ swiotlb_print_info();
+ return 0;
+
+error_slots:
+ free_pages((unsigned long)mem->areas, area_order);
+error_area:
+ free_pages((unsigned long)vstart, order);
+ return -ENOMEM;
}
-static void swiotlb_cleanup(void)
+void __init swiotlb_exit(void)
{
- io_tlb_end = 0;
- io_tlb_start = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long tbl_vaddr;
+ size_t tbl_size, slots_size;
+ unsigned int area_order;
+
+ if (swiotlb_force_bounce)
+ return;
+
+ if (!mem->nslabs)
+ return;
+
+ pr_info("tearing down default memory pool\n");
+ tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
+ tbl_size = PAGE_ALIGN(mem->end - mem->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ mem->nareas));
+ free_pages((unsigned long)mem->areas, area_order);
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)mem->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(__pa(mem->areas),
+ array_size(sizeof(*mem->areas), mem->nareas));
+ memblock_free_late(mem->start, tbl_size);
+ memblock_free_late(__pa(mem->slots), slots_size);
+ }
+
+ memset(mem, 0, sizeof(*mem));
}
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * alloc_dma_pages() - allocate pages to be used for DMA
+ * @gfp: GFP flags for the allocation.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ *
+ * Allocate pages from the buddy allocator. If successful, make the allocated
+ * pages decrypted that they can be used for DMA.
+ *
+ * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
+ * if the allocated physical address was above @phys_limit.
+ */
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
{
- unsigned long i, bytes;
+ unsigned int order = get_order(bytes);
+ struct page *page;
+ phys_addr_t paddr;
+ void *vaddr;
- bytes = nslabs << IO_TLB_SHIFT;
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
- io_tlb_nslabs = nslabs;
- io_tlb_start = virt_to_phys(tlb);
- io_tlb_end = io_tlb_start + bytes;
+ paddr = page_to_phys(page);
+ if (paddr + bytes - 1 > phys_limit) {
+ __free_pages(page, order);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ vaddr = phys_to_virt(paddr);
+ if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ goto error;
+ return page;
- set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- memset(tlb, 0, bytes);
+error:
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(page, order);
+ return NULL;
+}
+
+/**
+ * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer
+ * @dev: Device for which a memory pool is allocated.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ * @gfp: GFP flags for the allocation.
+ *
+ * Return: Allocated pages, or %NULL on allocation failure.
+ */
+static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
+ u64 phys_limit, gfp_t gfp)
+{
+ struct page *page;
/*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
+ * Allocate from the atomic pools if memory is encrypted and
+ * the allocation is atomic, because decrypting may block.
*/
- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
- if (!io_tlb_list)
- goto cleanup3;
+ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ void *vaddr;
- io_tlb_orig_addr = (phys_addr_t *)
- __get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs *
- sizeof(phys_addr_t)));
- if (!io_tlb_orig_addr)
- goto cleanup4;
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return NULL;
- for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
+ dma_coherent_ok);
}
- io_tlb_index = 0;
- swiotlb_print_info();
+ gfp &= ~GFP_ZONEMASK;
+ if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ gfp |= __GFP_DMA;
+ else if (phys_limit <= DMA_BIT_MASK(32))
+ gfp |= __GFP_DMA32;
+
+ while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (__GFP_DMA32 | __GFP_DMA)))
+ gfp |= __GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ !(gfp & __GFP_DMA))
+ gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA;
+ else
+ return NULL;
+ }
- late_alloc = 1;
+ return page;
+}
- swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+/**
+ * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
+ * @vaddr: Virtual address of the buffer.
+ * @bytes: Size of the buffer.
+ */
+static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(NULL, vaddr, bytes))
+ return;
- return 0;
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(virt_to_page(vaddr), get_order(bytes));
+}
-cleanup4:
- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
- sizeof(int)));
- io_tlb_list = NULL;
-cleanup3:
- swiotlb_cleanup();
- return -ENOMEM;
+/**
+ * swiotlb_alloc_pool() - allocate a new IO TLB memory pool
+ * @dev: Device for which a memory pool is allocated.
+ * @minslabs: Minimum number of slabs.
+ * @nslabs: Desired (maximum) number of slabs.
+ * @nareas: Number of areas.
+ * @phys_limit: Maximum DMA buffer physical address.
+ * @gfp: GFP flags for the allocations.
+ *
+ * Allocate and initialize a new IO TLB memory pool. The actual number of
+ * slabs may be reduced if allocation of @nslabs fails. If even
+ * @minslabs cannot be allocated, this function fails.
+ *
+ * Return: New memory pool, or %NULL on allocation failure.
+ */
+static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
+ unsigned long minslabs, unsigned long nslabs,
+ unsigned int nareas, u64 phys_limit, gfp_t gfp)
+{
+ struct io_tlb_pool *pool;
+ unsigned int slot_order;
+ struct page *tlb;
+ size_t pool_size;
+ size_t tlb_size;
+
+ if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+ nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
+ pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
+ pool = kzalloc(pool_size, gfp);
+ if (!pool)
+ goto error;
+ pool->areas = (void *)pool + sizeof(*pool);
+
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ if (nslabs <= minslabs)
+ goto error_tlb;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ }
+
+ slot_order = get_order(array_size(sizeof(*pool->slots), nslabs));
+ pool->slots = (struct io_tlb_slot *)
+ __get_free_pages(gfp, slot_order);
+ if (!pool->slots)
+ goto error_slots;
+
+ swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas);
+ return pool;
+
+error_slots:
+ swiotlb_free_tlb(page_address(tlb), tlb_size);
+error_tlb:
+ kfree(pool);
+error:
+ return NULL;
}
-void __init swiotlb_exit(void)
+/**
+ * swiotlb_dyn_alloc() - dynamic memory pool allocation worker
+ * @work: Pointer to dyn_alloc in struct io_tlb_mem.
+ */
+static void swiotlb_dyn_alloc(struct work_struct *work)
{
- if (!io_tlb_orig_addr)
+ struct io_tlb_mem *mem =
+ container_of(work, struct io_tlb_mem, dyn_alloc);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
+ default_nareas, mem->phys_limit, GFP_KERNEL);
+ if (!pool) {
+ pr_warn_ratelimited("Failed to allocate new pool");
return;
+ }
- if (late_alloc) {
- free_pages((unsigned long)io_tlb_orig_addr,
- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
- sizeof(int)));
- free_pages((unsigned long)phys_to_virt(io_tlb_start),
- get_order(io_tlb_nslabs << IO_TLB_SHIFT));
- } else {
- memblock_free_late(__pa(io_tlb_orig_addr),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
- memblock_free_late(__pa(io_tlb_list),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
- memblock_free_late(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ add_mem_pool(mem, pool);
+}
+
+/**
+ * swiotlb_dyn_free() - RCU callback to free a memory pool
+ * @rcu: RCU head in the corresponding struct io_tlb_pool.
+ */
+static void swiotlb_dyn_free(struct rcu_head *rcu)
+{
+ struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu);
+ size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs);
+ size_t tlb_size = pool->end - pool->start;
+
+ free_pages((unsigned long)pool->slots, get_order(slots_size));
+ swiotlb_free_tlb(pool->vaddr, tlb_size);
+ kfree(pool);
+}
+
+/**
+ * swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * @dev: Device which has mapped the DMA buffer.
+ * @paddr: Physical address within the DMA buffer.
+ *
+ * Find the IO TLB memory pool descriptor which contains the given physical
+ * address, if any.
+ *
+ * Return: Memory pool which contains @paddr, or %NULL if none.
+ */
+struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+
+ list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
}
- swiotlb_cleanup();
+ pool = NULL;
+out:
+ rcu_read_unlock();
+ return pool;
+}
+
+/**
+ * swiotlb_del_pool() - remove an IO TLB pool from a device
+ * @dev: Owning device.
+ * @pool: Memory pool to be removed.
+ */
+static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_del_rcu(&pool->node);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+ call_rcu(&pool->rcu, swiotlb_dyn_free);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_dev_init() - initialize swiotlb fields in &struct device
+ * @dev: Device to be initialized.
+ */
+void swiotlb_dev_init(struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ INIT_LIST_HEAD(&dev->dma_io_tlb_pools);
+ spin_lock_init(&dev->dma_io_tlb_lock);
+ dev->dma_uses_io_tlb = false;
+#endif
+}
+
+/*
+ * Return the offset into a iotlb slot required to keep the device happy.
+ */
+static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+{
+ return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
}
/*
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
-static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+ enum dma_data_direction dir)
{
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
+ int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = mem->slots[index].orig_addr;
+ size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
- unsigned char *vaddr = phys_to_virt(tlb_addr);
+ unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
+ unsigned int tlb_offset, orig_addr_offset;
+
+ if (orig_addr == INVALID_PHYS_ADDR)
+ return;
+
+ tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
+ orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
+ if (tlb_offset < orig_addr_offset) {
+ dev_WARN_ONCE(dev, 1,
+ "Access before mapping start detected. orig offset %u, requested offset %u.\n",
+ orig_addr_offset, tlb_offset);
+ return;
+ }
+
+ tlb_offset -= orig_addr_offset;
+ if (tlb_offset > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
+ alloc_size, size, tlb_offset);
+ return;
+ }
+
+ orig_addr += tlb_offset;
+ alloc_size -= tlb_offset;
+
+ if (size > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n",
+ alloc_size, size);
+ size = alloc_size;
+ }
if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -423,12 +883,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -443,157 +902,458 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
}
}
-phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
- dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr,
- size_t mapping_size,
- size_t alloc_size,
- enum dma_data_direction dir,
- unsigned long attrs)
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
{
- unsigned long flags;
- phys_addr_t tlb_addr;
- unsigned int nslots, stride, index, wrap;
- int i;
- unsigned long mask;
- unsigned long offset_slots;
- unsigned long max_slots;
- unsigned long tmp_io_tlb_used;
+ return start + (idx << IO_TLB_SHIFT);
+}
- if (no_iotlb_memory)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+/*
+ * Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
+ */
+static inline unsigned long get_max_slots(unsigned long boundary_mask)
+{
+ return (boundary_mask >> IO_TLB_SHIFT) + 1;
+}
- if (mem_encrypt_active())
- pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)
+{
+ if (index >= mem->area_nslabs)
+ return 0;
+ return index;
+}
- if (mapping_size > alloc_size) {
- dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
- mapping_size, alloc_size);
- return (phys_addr_t)DMA_MAPPING_ERROR;
- }
+/*
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
+ */
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ unsigned long old_hiwater, new_used;
- mask = dma_get_seg_boundary(hwdev);
+ new_used = atomic_long_add_return(nslots, &mem->total_used);
+ old_hiwater = atomic_long_read(&mem->used_hiwater);
+ do {
+ if (new_used <= old_hiwater)
+ break;
+ } while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+ &old_hiwater, new_used));
+}
- tbl_dma_addr &= mask;
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->total_used);
+}
- offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * swiotlb_search_pool_area() - search one memory area in one pool
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @area_index: Index of the IO TLB memory area to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Find a suitable sequence of IO TLB entries for the request and allocate
+ * a buffer from the given IO TLB memory area.
+ * This function takes care of locking.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
+ int area_index, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
+{
+ struct io_tlb_area *area = pool->areas + area_index;
+ unsigned long boundary_mask = dma_get_seg_boundary(dev);
+ dma_addr_t tbl_dma_addr =
+ phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
+ unsigned long max_slots = get_max_slots(boundary_mask);
+ unsigned int iotlb_align_mask =
+ dma_get_min_align_mask(dev) | alloc_align_mask;
+ unsigned int nslots = nr_slots(alloc_size), stride;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, slots_checked, count = 0, i;
+ unsigned long flags;
+ unsigned int slot_base;
+ unsigned int slot_index;
- /*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
- max_slots = mask + 1
- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ BUG_ON(!nslots);
+ BUG_ON(area_index >= pool->nareas);
/*
- * For mappings greater than or equal to a page, we limit the stride
- * (and hence alignment) to a page size.
+ * For allocations of PAGE_SIZE or larger only look for page aligned
+ * allocations.
*/
- nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
if (alloc_size >= PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
- BUG_ON(!nslots);
+ iotlb_align_mask |= ~PAGE_MASK;
+ iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
/*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
+ * For mappings with an alignment requirement don't bother looping to
+ * unaligned slots once we found an aligned one.
*/
- spin_lock_irqsave(&io_tlb_lock, flags);
+ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
- if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ spin_lock_irqsave(&area->lock, flags);
+ if (unlikely(nslots > pool->area_nslabs - area->used))
goto not_found;
- index = ALIGN(io_tlb_index, stride);
- if (index >= io_tlb_nslabs)
- index = 0;
- wrap = index;
+ slot_base = area_index * pool->area_nslabs;
+ index = area->index;
- do {
- while (iommu_is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
- }
+ for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ slot_index = slot_base + index;
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in the next
- * round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
+ if (orig_addr &&
+ (slot_addr(tbl_dma_addr, slot_index) &
+ iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ index = wrap_area_index(pool, index + 1);
+ slots_checked++;
+ continue;
+ }
- goto found;
+ if (!iommu_is_span_boundary(slot_index, nslots,
+ nr_slots(tbl_dma_addr),
+ max_slots)) {
+ if (pool->slots[slot_index].list >= nslots)
+ goto found;
}
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- } while (index != wrap);
+ index = wrap_area_index(pool, index + stride);
+ slots_checked += stride;
+ }
not_found:
- tmp_io_tlb_used = io_tlb_used;
+ spin_unlock_irqrestore(&area->lock, flags);
+ return -1;
- spin_unlock_irqrestore(&io_tlb_lock, flags);
- if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
- return (phys_addr_t)DMA_MAPPING_ERROR;
found:
- io_tlb_used += nslots;
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot onwards
+ * and set the list of free entries to '0' indicating unavailable.
+ */
+ for (i = slot_index; i < slot_index + nslots; i++) {
+ pool->slots[i].list = 0;
+ pool->slots[i].alloc_size = alloc_size - (offset +
+ ((i - slot_index) << IO_TLB_SHIFT));
+ }
+ for (i = slot_index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
+ pool->slots[i].list; i--)
+ pool->slots[i].list = ++count;
/*
- * Save away the mapping from the original address to the DMA address.
- * This is needed when we sync the memory. Then we sync the buffer if
- * needed.
+ * Update the indices to avoid searching in the next round.
*/
- for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ area->index = wrap_area_index(pool, index + nslots);
+ area->used += nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
- return tlb_addr;
+ inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots);
+ return slot_index;
}
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_search_area() - search one memory area in all pools
+ * @dev: Device which maps the buffer.
+ * @start_cpu: Start CPU number.
+ * @cpu_offset: Offset from @start_cpu.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search one memory area in all pools for a sequence of slots that match the
+ * allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_area(struct device *dev, int start_cpu,
+ int cpu_offset, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask, struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ int area_index;
+ int index = -1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (cpu_offset >= pool->nareas)
+ continue;
+ area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+ index = swiotlb_search_pool_area(dev, pool, area_index,
+ orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0) {
+ *retpool = pool;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return index;
+}
+
+/**
+ * swiotlb_find_slots() - search for slots in the whole swiotlb
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search through the whole software IO TLB to find a sequence of slots that
+ * match the allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
*/
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ unsigned long nslabs;
unsigned long flags;
- int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t orig_addr = io_tlb_orig_addr[index];
+ u64 phys_limit;
+ int cpu, i;
+ int index;
+
+ if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+ return -1;
+
+ cpu = raw_smp_processor_id();
+ for (i = 0; i < default_nareas; ++i) {
+ index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
+ alloc_align_mask, &pool);
+ if (index >= 0)
+ goto found;
+ }
+
+ if (!mem->can_grow)
+ return -1;
+
+ schedule_work(&mem->dyn_alloc);
+
+ nslabs = nr_slots(alloc_size);
+ phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!pool)
+ return -1;
+
+ index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index < 0) {
+ swiotlb_dyn_free(&pool->rcu);
+ return -1;
+ }
+
+ pool->transient = true;
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+found:
+ WRITE_ONCE(dev->dma_uses_io_tlb, true);
/*
- * First, sync the memory before unmapping the entry
+ * The general barrier orders reads and writes against a presumed store
+ * of the SWIOTLB buffer address by a device driver (to a driver private
+ * data structure). It serves two purposes.
+ *
+ * First, the store to dev->dma_uses_io_tlb must be ordered before the
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+ *
+ * Second, the load from mem->pools must be ordered before the same
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be observed by another CPU before an update of the RCU list
+ * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+ * atomicity).
+ *
+ * See also the comment in is_swiotlb_buffer().
+ */
+ smp_mb();
+
+ *retpool = pool;
+ return index;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_pool *pool;
+ int start, i;
+ int index;
+
+ *retpool = pool = &dev->dma_io_tlb_mem->defpool;
+ i = start = raw_smp_processor_id() & (pool->nareas - 1);
+ do {
+ index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= pool->nareas)
+ i = 0;
+ } while (i != start);
+ return -1;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+#ifdef CONFIG_DEBUG_FS
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is accurate in this version of the function, because an atomic
+ * counter is available if CONFIG_DEBUG_FS is set.
+ *
+ * Return: Number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+
+/**
+ * mem_pool_used() - get number of used slots in a memory pool
+ * @pool: Software IO TLB memory pool.
+ *
+ * The result is not accurate, see mem_used().
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_pool_used(struct io_tlb_pool *pool)
+{
+ int i;
+ unsigned long used = 0;
+
+ for (i = 0; i < pool->nareas; i++)
+ used += pool->areas[i].used;
+ return used;
+}
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is not accurate, because there is no locking of individual
+ * areas.
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ struct io_tlb_pool *pool;
+ unsigned long used = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node)
+ used += mem_pool_used(pool);
+ rcu_read_unlock();
+
+ return used;
+#else
+ return mem_pool_used(&mem->defpool);
+#endif
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
+ size_t mapping_size, size_t alloc_size,
+ unsigned int alloc_align_mask, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ struct io_tlb_pool *pool;
+ unsigned int i;
+ int index;
+ phys_addr_t tlb_addr;
+
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ if (mapping_size > alloc_size) {
+ dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ index = swiotlb_find_slots(dev, orig_addr,
+ alloc_size + offset, alloc_align_mask, &pool);
+ if (index == -1) {
+ if (!(attrs & DMA_ATTR_NO_WARN))
+ dev_warn_ratelimited(dev,
+ "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ alloc_size, mem->nslabs, mem_used(mem));
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ /*
+ * Save away the mapping from the original address to the DMA address.
+ * This is needed when we sync the memory. Then we sync the buffer if
+ * needed.
*/
- if (orig_addr != INVALID_PHYS_ADDR &&
- !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ for (i = 0; i < nr_slots(alloc_size + offset); i++)
+ pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(pool->start, index) + offset;
+ /*
+ * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy
+ * the original buffer to the TLB buffer before initiating DMA in order
+ * to preserve the original's data if the device does a partial write,
+ * i.e. if the device doesn't overwrite the entire buffer. Preserving
+ * the original data, even if it's garbage, is necessary to match
+ * hardware behavior. Use of swiotlb is supposed to be transparent,
+ * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ return tlb_addr;
+}
+
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
+{
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
+ unsigned long flags;
+ unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
+ int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+ int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ int aindex = index / mem->area_nslabs;
+ struct io_tlb_area *area = &mem->areas[aindex];
+ int count, i;
/*
* Return the buffer to the free list by setting the corresponding
@@ -601,59 +1361,108 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--) {
- io_tlb_list[i] = ++count;
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- }
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
+ BUG_ON(aindex >= mem->nareas);
- io_tlb_used -= nslots;
+ spin_lock_irqsave(&area->lock, flags);
+ if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
+ count = mem->slots[index + nslots].list;
+ else
+ count = 0;
+
+ /*
+ * Step 1: return the slots to the free list, merging the slots with
+ * superceeding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ mem->slots[i].list = ++count;
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
}
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+ /*
+ * Step 2: merge the returned slots with the preceding slots, if
+ * available (non zero)
+ */
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
+ i--)
+ mem->slots[i].list = ++count;
+ area->used -= nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ dec_used(dev->dma_io_tlb_mem, nslots);
}
-void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir,
- enum dma_sync_target target)
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_del_transient() - delete a transient memory pool
+ * @dev: Device which mapped the buffer.
+ * @tlb_addr: Physical address within a bounce buffer.
+ *
+ * Check whether the address belongs to a transient SWIOTLB memory pool.
+ * If yes, then delete the pool.
+ *
+ * Return: %true if @tlb_addr belonged to a transient pool that was released.
+ */
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
{
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t orig_addr = io_tlb_orig_addr[index];
+ struct io_tlb_pool *pool;
- if (orig_addr == INVALID_PHYS_ADDR)
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool->transient)
+ return false;
+
+ dec_used(dev->dma_io_tlb_mem, pool->nslabs);
+ swiotlb_del_pool(dev, pool);
+ return true;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static inline bool swiotlb_del_transient(struct device *dev,
+ phys_addr_t tlb_addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+ if (swiotlb_del_transient(dev, tlb_addr))
return;
- orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+ swiotlb_release_slots(dev, tlb_addr);
+}
- switch (target) {
- case SYNC_FOR_CPU:
- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr,
- size, DMA_FROM_DEVICE);
- else
- BUG_ON(dir != DMA_TO_DEVICE);
- break;
- case SYNC_FOR_DEVICE:
- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr,
- size, DMA_TO_DEVICE);
- else
- BUG_ON(dir != DMA_FROM_DEVICE);
- break;
- default:
- BUG();
- }
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
+}
+
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ else
+ BUG_ON(dir != DMA_TO_DEVICE);
}
/*
@@ -666,19 +1475,17 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
phys_addr_t swiotlb_addr;
dma_addr_t dma_addr;
- trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
- swiotlb_force);
+ trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
- swiotlb_addr = swiotlb_tbl_map_single(dev,
- __phys_to_dma(dev, io_tlb_start),
- paddr, size, size, dir, attrs);
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
+ attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
/* Ensure that the address returned is DMA'ble */
- dma_addr = __phys_to_dma(dev, swiotlb_addr);
+ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
+ swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
@@ -693,30 +1500,251 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
-}
+ int min_align_mask = dma_get_min_align_mask(dev);
+ int min_align = 0;
-bool is_swiotlb_active(void)
-{
/*
- * When SWIOTLB is initialized, even if io_tlb_start points to physical
- * address zero, io_tlb_end surely doesn't.
+ * swiotlb_find_slots() skips slots according to
+ * min align mask. This affects max mapping size.
+ * Take it into acount here.
*/
- return io_tlb_end != 0;
+ if (min_align_mask)
+ min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
+}
+
+/**
+ * is_swiotlb_allocated() - check if the default software IO TLB is initialized
+ */
+bool is_swiotlb_allocated(void)
+{
+ return io_tlb_default_mem.nslabs;
+}
+
+bool is_swiotlb_active(struct device *dev)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->nslabs;
+}
+
+/**
+ * default_swiotlb_base() - get the base address of the default SWIOTLB
+ *
+ * Get the lowest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_base(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ io_tlb_default_mem.can_grow = false;
+#endif
+ return io_tlb_default_mem.defpool.start;
+}
+
+/**
+ * default_swiotlb_limit() - get the address limit of the default SWIOTLB
+ *
+ * Get the highest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_limit(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ return io_tlb_default_mem.phys_limit;
+#else
+ return io_tlb_default_mem.defpool.end - 1;
+#endif
}
#ifdef CONFIG_DEBUG_FS
-static int __init swiotlb_create_debugfs(void)
+static int io_tlb_used_get(void *data, u64 *val)
{
- struct dentry *root;
+ struct io_tlb_mem *mem = data;
- root = debugfs_create_dir("swiotlb", NULL);
- debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
- debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
+ *val = mem_used(mem);
return 0;
}
-late_initcall(swiotlb_create_debugfs);
+static int io_tlb_hiwater_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+ *val = atomic_long_read(&mem->used_hiwater);
+ return 0;
+}
+
+static int io_tlb_hiwater_set(void *data, u64 val)
+{
+ struct io_tlb_mem *mem = data;
+
+ /* Only allow setting to zero */
+ if (val != 0)
+ return -EINVAL;
+
+ atomic_long_set(&mem->used_hiwater, val);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+ io_tlb_hiwater_set, "%llu\n");
+
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+ atomic_long_set(&mem->total_used, 0);
+ atomic_long_set(&mem->used_hiwater, 0);
+
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
+ debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
+ &fops_io_tlb_used);
+ debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+ &fops_io_tlb_hiwater);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
+ return 0;
+}
+
+late_initcall(swiotlb_create_default_debugfs);
+
+#else /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ phys_addr_t tlb_addr;
+ int index;
+
+ if (!mem)
+ return NULL;
+
+ index = swiotlb_find_slots(dev, 0, size, 0, &pool);
+ if (index == -1)
+ return NULL;
+
+ tlb_addr = slot_addr(pool->start, index);
+
+ return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+ phys_addr_t tlb_addr = page_to_phys(page);
+
+ if (!is_swiotlb_buffer(dev, tlb_addr))
+ return false;
+
+ swiotlb_release_slots(dev, tlb_addr);
+
+ return true;
+}
+
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+ unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+ /* Set Per-device io tlb area to one */
+ unsigned int nareas = 1;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
+ /*
+ * Since multiple devices can share the same pool, the private data,
+ * io_tlb_mem struct, will be initialized by the first device attached
+ * to it.
+ */
+ if (!mem) {
+ struct io_tlb_pool *pool;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+ pool = &mem->defpool;
+
+ pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL);
+ if (!pool->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ pool->areas = kcalloc(nareas, sizeof(*pool->areas),
+ GFP_KERNEL);
+ if (!pool->areas) {
+ kfree(pool->slots);
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
+ false, nareas);
+ mem->force_bounce = true;
+ mem->for_alloc = true;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock_init(&mem->lock);
#endif
+ add_mem_pool(mem, pool);
+
+ rmem->priv = mem;
+
+ swiotlb_create_debugfs_files(mem, rmem->name);
+ }
+
+ dev->dma_io_tlb_mem = mem;
+
+ return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+ of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ rmem->ops = &rmem_swiotlb_ops;
+ pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
deleted file mode 100644
index ebe128833af7..000000000000
--- a/kernel/dma/virt.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DMA operations that map to virtual addresses without flushing memory.
- */
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-
-static void *dma_virt_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp,
- unsigned long attrs)
-{
- void *ret;
-
- ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
- if (ret)
- *dma_handle = (uintptr_t)ret;
- return ret;
-}
-
-static void dma_virt_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr,
- unsigned long attrs)
-{
- free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- return (uintptr_t)(page_address(page) + offset);
-}
-
-static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir,
- unsigned long attrs)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sgl, sg, nents, i) {
- BUG_ON(!sg_page(sg));
- sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
- sg_dma_len(sg) = sg->length;
- }
-
- return nents;
-}
-
-const struct dma_map_ops dma_virt_ops = {
- .alloc = dma_virt_alloc,
- .free = dma_virt_free,
- .map_page = dma_virt_map_page,
- .map_sg = dma_virt_map_sg,
-};
-EXPORT_SYMBOL(dma_virt_ops);
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
deleted file mode 100644
index 57fb4dcff434..000000000000
--- a/kernel/elfcore.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/elf.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/binfmts.h>
-#include <linux/elfcore.h>
-
-Elf_Half __weak elf_core_extra_phdrs(void)
-{
- return 0;
-}
-
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
- return 1;
-}
-
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
-{
- return 1;
-}
-
-size_t __weak elf_core_extra_data_size(void)
-{
- return 0;
-}
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
new file mode 100644
index 000000000000..095c775e001e
--- /dev/null
+++ b/kernel/entry/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Prevent the noinstr section from being pestered by sanitizer and other goodies
+# as long as these things cannot be disabled per function.
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
+CFLAGS_common.o += -fno-stack-protector
+
+obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
+obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
new file mode 100644
index 000000000000..88cb3c88aaa5
--- /dev/null
+++ b/kernel/entry/common.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
+#include <linux/highmem.h>
+#include <linux/jump_label.h>
+#include <linux/kmsan.h>
+#include <linux/livepatch.h>
+#include <linux/audit.h>
+#include <linux/tick.h>
+
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long work)
+{
+ long ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing(NULL);
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ trace_sys_enter(regs, syscall);
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
+/* Workaround to allow gradual conversion of architecture code */
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
+
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ /*
+ * Before returning to user space ensure that all pending work
+ * items have been completed.
+ */
+ while (ti_work & EXIT_TO_USER_MODE_WORK) {
+
+ local_irq_enable_exit_to_user(ti_work);
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ if (ti_work & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ arch_do_signal_or_restart(regs);
+
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
+
+ /* Architecture specific TIF work */
+ arch_exit_to_user_mode_work(regs, ti_work);
+
+ /*
+ * Disable interrupts and reevaluate the work flags as they
+ * might have changed while interrupts and preemption was
+ * enabled above.
+ */
+ local_irq_disable_exit_to_user();
+
+ /* Check if any of the above work has queued a deferred wakeup */
+ tick_nohz_user_enter_prepare();
+
+ ti_work = read_thread_flags();
+ }
+
+ /* Return the latest work state for arch_exit_to_user_mode() */
+ return ti_work;
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ ptrace_report_syscall_exit(regs, step);
+}
+
+/*
+ * Syscall specific exit to user mode preparation. Runs with interrupts
+ * enabled.
+ */
+static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
+ unsigned long nr = syscall_get_nr(current, regs);
+
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
+ local_irq_enable();
+ }
+
+ rseq_syscall(regs);
+
+ /*
+ * Do one-time syscall specific work. If these work items are
+ * enabled, we want to run them exactly once per syscall exit with
+ * interrupts enabled.
+ */
+ if (unlikely(work & SYSCALL_WORK_EXIT))
+ syscall_exit_work(regs, work);
+}
+
+static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
+{
+ syscall_exit_to_user_mode_prepare(regs);
+ local_irq_disable_exit_to_user();
+ exit_to_user_mode_prepare(regs);
+}
+
+void syscall_exit_to_user_mode_work(struct pt_regs *regs)
+{
+ __syscall_exit_to_user_mode_work(regs);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __syscall_exit_to_user_mode_work(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+}
+
+noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
+{
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
+
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ return ret;
+ }
+
+ /*
+ * If this entry hit the idle task invoke ct_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for rcu_is_watching() here would prevent the nesting
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interrupt and eventually claim
+ * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in irqentry_enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ ct_irq_enter();
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ ret.exit_rcu = true;
+ return ret;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ rcu_irq_enter_check_tick();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ return ret;
+}
+
+void raw_irqentry_exit_cond_resched(void)
+{
+ if (!preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched())
+ preempt_schedule_irq();
+ }
+}
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
+#endif
+
+noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ irqentry_exit_to_user_mode(regs);
+ } else if (!regs_irqs_disabled(regs)) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (state.exit_rcu) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ instrumentation_end();
+ ct_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ irqentry_exit_cond_resched();
+
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (state.exit_rcu)
+ ct_irq_exit();
+ }
+}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ ct_nmi_enter();
+
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ }
+ instrumentation_end();
+
+ ct_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
new file mode 100644
index 000000000000..f6e6d02f07fe
--- /dev/null
+++ b/kernel/entry/common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _COMMON_H
+#define _COMMON_H
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+
+#endif
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
new file mode 100644
index 000000000000..2e0f75bcb7fd
--- /dev/null
+++ b/kernel/entry/kvm.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-kvm.h>
+#include <linux/kvm_host.h>
+
+static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+{
+ do {
+ int ret;
+
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(NULL);
+
+ ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
+ if (ret)
+ return ret;
+
+ ti_work = read_thread_flags();
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ return 0;
+}
+
+int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
+{
+ unsigned long ti_work;
+
+ /*
+ * This is invoked from the outer guest loop with interrupts and
+ * preemption enabled.
+ *
+ * KVM invokes xfer_to_guest_mode_work_pending() with interrupts
+ * disabled in the inner loop before going into guest mode. No need
+ * to disable interrupts here.
+ */
+ ti_work = read_thread_flags();
+ if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
+ return 0;
+
+ return xfer_to_guest_mode_work(vcpu, ti_work);
+}
+EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
new file mode 100644
index 000000000000..5340c5aa89e7
--- /dev/null
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#include <linux/sched.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/syscall_user_dispatch.h>
+#include <linux/uaccess.h>
+#include <linux/signal.h>
+#include <linux/elf.h>
+
+#include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/syscall.h>
+
+#include "common.h"
+
+static void trigger_sigsys(struct pt_regs *regs)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_USER_DISPATCH;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = 0;
+ info.si_arch = syscall_get_arch(current);
+ info.si_syscall = syscall_get_nr(current, regs);
+
+ force_sig_info(&info);
+}
+
+bool syscall_user_dispatch(struct pt_regs *regs)
+{
+ struct syscall_user_dispatch *sd = &current->syscall_dispatch;
+ char state;
+
+ if (likely(instruction_pointer(regs) - sd->offset < sd->len))
+ return false;
+
+ if (unlikely(arch_syscall_is_vdso_sigreturn(regs)))
+ return false;
+
+ if (likely(sd->selector)) {
+ /*
+ * access_ok() is performed once, at prctl time, when
+ * the selector is loaded by userspace.
+ */
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_exit_sig(SIGSEGV);
+ return true;
+ }
+
+ if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
+ return false;
+
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_exit_sig(SIGSYS);
+ return true;
+ }
+ }
+
+ sd->on_dispatch = true;
+ syscall_rollback(current, regs);
+ trigger_sigsys(regs);
+
+ return true;
+}
+
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
+{
+ switch (mode) {
+ case PR_SYS_DISPATCH_OFF:
+ if (offset || len || selector)
+ return -EINVAL;
+ break;
+ case PR_SYS_DISPATCH_ON:
+ /*
+ * Validate the direct dispatcher region just for basic
+ * sanity against overflow and a 0-sized dispatcher
+ * region. If the user is able to submit a syscall from
+ * an address, that address is obviously valid.
+ */
+ if (offset && offset + len <= offset)
+ return -EINVAL;
+
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
+
+ if (mode == PR_SYS_DISPATCH_ON)
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
+ else
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 3c022e33c109..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,10 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
-endif
-
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
-
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 334d48b16c36..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -149,7 +149,7 @@ void put_callchain_buffers(void)
}
}
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
@@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
return NULL;
entries = rcu_dereference(callchain_cpus_entries);
- if (!entries)
+ if (!entries) {
+ put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
return NULL;
+ }
cpu = smp_processor_id();
@@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
(*rctx * perf_callchain_entry__sizeof()));
}
-static void
+void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
@@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
int rctx;
entry = get_callchain_entry(&rctx);
- if (rctx == -1)
- return NULL;
-
if (!entry)
- goto exit_put;
+ return NULL;
ctx.entry = entry;
ctx.max_stack = max_stack;
@@ -210,18 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = get_fs();
- set_fs(USER_DS);
perf_callchain_user(&ctx, regs);
- set_fs(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..f0f0f71213a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,10 @@
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>
+#include <linux/pgtable.h>
+#include <linux/buildid.h>
+#include <linux/task_work.h>
#include "internal.h"
@@ -99,7 +103,7 @@ static void remote_function(void *data)
* retry due to any failures in smp_call_function_single(), such as if the
* task_cpu() goes offline concurrently.
*
- * returns @func return value or -ESRCH when the process isn't running
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -115,7 +119,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
for (;;) {
ret = smp_call_function_single(task_cpu(p), remote_function,
&data, 1);
- ret = !ret ? data.ret : -EAGAIN;
+ if (!ret)
+ ret = data.ret;
if (ret != -EAGAIN)
break;
@@ -128,6 +133,7 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
/**
* cpu_function_call - call a function on the cpu
+ * @cpu: target cpu to queue this function
* @func: the function to be called
* @info: the function call argument
*
@@ -149,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -178,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -211,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -265,7 +273,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
if (!event->parent) {
/*
* If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
* perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
@@ -308,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -367,12 +375,12 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
static void perf_sched_delayed(struct work_struct *work);
@@ -381,8 +389,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -394,11 +400,14 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
+static atomic_t nr_build_id_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static struct kmem_cache *perf_event_cache;
/*
* perf event paranoia level:
@@ -439,10 +448,10 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -562,23 +571,11 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
-extern __weak const char *perf_pmu_name(void)
-{
- return "pmu";
-}
-
static inline u64 perf_clock(void)
{
return local_clock();
@@ -671,13 +668,54 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ perf_pmu_disable(pmu_ctx->pmu);
+ }
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ perf_pmu_enable(pmu_ctx->pmu);
+ }
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -716,35 +754,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
- struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
/*
* ensure we access cgroup data only when needed and
@@ -753,19 +807,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current, event->ctx);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (info->active)
+ __update_cgrp_time(info, perf_clock(), true);
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
struct cgroup_subsys_state *css;
@@ -774,127 +828,59 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task, ctx);
+ WARN_ON_ONCE(!ctx->nr_cgroups);
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
-
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-static void perf_cgroup_switch(struct task_struct *task, int mode)
-{
- struct perf_cpu_context *cpuctx;
- struct list_head *list;
- unsigned long flags;
-
- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
-
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
-
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
-
- local_irq_restore(flags);
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
+static void perf_cgroup_switch(struct task_struct *task)
{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cgroup *cgrp;
- rcu_read_lock();
/*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
*/
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- rcu_read_unlock();
-}
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);
- rcu_read_lock();
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
*/
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(prev, NULL);
-
+ cpuctx->cgrp = cgrp;
/*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
*/
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
- rcu_read_unlock();
+ perf_ctx_enable(&cpuctx->ctx, true);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -912,7 +898,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -979,14 +965,6 @@ out:
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -994,30 +972,18 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
- /*
- * Since setting cpuctx->cgrp is conditional on the current @cgrp
- * matching the event's cgroup, we must do this for every new event,
- * because if the first would mismatch, the second would not try again
- * and we would leave cpuctx->cgrp unset.
- */
- if (ctx->is_active && !cpuctx->cgrp) {
- struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- cpuctx->cgrp = cgrp;
- }
-
if (ctx->nr_cgroups++)
return;
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}
static inline void
@@ -1028,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -1037,10 +1005,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (--ctx->nr_cgroups)
return;
- if (ctx->is_active && cpuctx->cgrp)
- cpuctx->cgrp = NULL;
-
- list_del(&cpuctx->cgrp_cpuctx_entry);
+ cpuctx->cgrp = NULL;
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1063,17 +1028,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
-{
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1085,22 +1041,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1114,6 +1064,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -1126,34 +1080,30 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1162,34 +1112,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1204,37 +1154,28 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static void get_ctx(struct perf_event_context *ctx)
{
- lockdep_assert_irqs_disabled();
+ refcount_inc(&ctx->refcount);
+}
- WARN_ON(list_empty(&ctx->active_ctx_list));
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+ if (pmu->task_ctx_cache)
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
- list_del_init(&ctx->active_ctx_list);
+ return NULL;
}
-static void get_ctx(struct perf_event_context *ctx)
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
- refcount_inc(&ctx->refcount);
+ if (pmu->task_ctx_cache && task_ctx_data)
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}
static void free_ctx(struct rcu_head *head)
@@ -1242,7 +1183,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx->task_ctx_data);
kfree(ctx);
}
@@ -1284,7 +1224,7 @@ static void put_ctx(struct perf_event_context *ctx)
* life-time rules separate them. That is an exiting task cannot fork, and a
* spawning task cannot (yet) exit.
*
- * But remember that that these are parent<->child context relations, and
+ * But remember that these are parent<->child context relations, and
* migration does not affect children, therefore these two orderings should not
* interact.
*
@@ -1310,7 +1250,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
- * exec_update_mutex
+ * exec_update_lock
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -1423,11 +1363,11 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
*
- * This has to cope with with the fact that until it is locked,
+ * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1443,7 +1383,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1456,7 +1396,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1483,12 +1423,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1508,22 +1448,61 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ lockdep_assert_held(&ctx->lock);
+
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -1578,82 +1557,125 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
groups->index = 0;
}
+static inline struct cgroup *event_cgroup(const struct perf_event *event)
+{
+ struct cgroup *cgroup = NULL;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp)
+ cgroup = event->cgrp->css.cgroup;
+#endif
+
+ return cgroup;
+}
+
/*
* Compare function for event groups;
*
* Implements complex key that first sorts by CPU and then by virtual index
* which provides ordering when rotating groups for the same CPU.
*/
-static bool
-perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+static __always_inline int
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
- if (left->cpu < right->cpu)
- return true;
- if (left->cpu > right->cpu)
- return false;
+ if (left_cpu < right->cpu)
+ return -1;
+ if (left_cpu > right->cpu)
+ return 1;
+
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (left->cgrp != right->cgrp) {
- if (!left->cgrp || !left->cgrp->css.cgroup) {
- /*
- * Left has no cgroup but right does, no cgroups come
- * first.
- */
- return true;
- }
- if (!right->cgrp || !right->cgrp->css.cgroup) {
- /*
- * Right has no cgroup but left does, no cgroups come
- * first.
- */
- return false;
- }
- /* Two dissimilar cgroups, order by id. */
- if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
- return true;
+ {
+ const struct cgroup *right_cgroup = event_cgroup(right);
- return false;
+ if (left_cgroup != right_cgroup) {
+ if (!left_cgroup) {
+ /*
+ * Left has no cgroup but right does, no
+ * cgroups come first.
+ */
+ return -1;
+ }
+ if (!right_cgroup) {
+ /*
+ * Right has no cgroup but left does, no
+ * cgroups come first.
+ */
+ return 1;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+ return -1;
+
+ return 1;
+ }
}
#endif
- if (left->group_index < right->group_index)
- return true;
- if (left->group_index > right->group_index)
- return false;
+ if (left_group_index < right->group_index)
+ return -1;
+ if (left_group_index > right->group_index)
+ return 1;
- return false;
+ return 0;
+}
+
+#define __node_2_pe(node) \
+ rb_entry((node), struct perf_event, group_node)
+
+static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct perf_event *e = __node_2_pe(a);
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
+}
+
+struct __group_key {
+ int cpu;
+ struct pmu *pmu;
+ struct cgroup *cgroup;
+};
+
+static inline int __group_cmp(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
struct perf_event *event)
{
- struct perf_event *node_event;
- struct rb_node *parent;
- struct rb_node **node;
-
event->group_index = ++groups->index;
- node = &groups->tree.rb_node;
- parent = *node;
-
- while (*node) {
- parent = *node;
- node_event = container_of(*node, struct perf_event, group_node);
-
- if (perf_event_groups_less(event, node_event))
- node = &parent->rb_left;
- else
- node = &parent->rb_right;
- }
-
- rb_link_node(&event->group_node, parent, node);
- rb_insert_color(&event->group_node, &groups->tree);
+ rb_add(&event->group_node, &groups->tree, __group_less);
}
/*
@@ -1695,82 +1717,47 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
- struct perf_event *node_event = NULL, *match = NULL;
- struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
- u64 node_cgrp_id, cgrp_id = 0;
-
- if (cgrp)
- cgrp_id = cgrp->kn->id;
-#endif
-
- while (node) {
- node_event = container_of(node, struct perf_event, group_node);
-
- if (cpu < node_event->cpu) {
- node = node->rb_left;
- continue;
- }
- if (cpu > node_event->cpu) {
- node = node->rb_right;
- continue;
- }
-#ifdef CONFIG_CGROUP_PERF
- node_cgrp_id = 0;
- if (node_event->cgrp && node_event->cgrp->css.cgroup)
- node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = cpu,
+ .pmu = pmu,
+ .cgroup = cgrp,
+ };
+ struct rb_node *node;
- if (cgrp_id < node_cgrp_id) {
- node = node->rb_left;
- continue;
- }
- if (cgrp_id > node_cgrp_id) {
- node = node->rb_right;
- continue;
- }
-#endif
- match = node_event;
- node = node->rb_left;
- }
+ node = rb_find_first(&key, &groups->tree, __group_cmp);
+ if (node)
+ return __node_2_pe(node);
- return match;
+ return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
- struct perf_event *next;
-#ifdef CONFIG_CGROUP_PERF
- u64 curr_cgrp_id = 0;
- u64 next_cgrp_id = 0;
-#endif
-
- next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next == NULL || next->cpu != event->cpu)
- return NULL;
-
-#ifdef CONFIG_CGROUP_PERF
- if (event->cgrp && event->cgrp->css.cgroup)
- curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = event->cpu,
+ .pmu = pmu,
+ .cgroup = event_cgroup(event),
+ };
+ struct rb_node *next;
- if (next->cgrp && next->cgrp->css.cgroup)
- next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+ next = rb_next_match(&key, &event->group_node, __group_cmp);
+ if (next)
+ return __node_2_pe(next);
- if (curr_cgrp_id != next_cgrp_id)
- return NULL;
-#endif
- return next;
+ return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1806,6 +1793,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1813,6 +1802,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1824,28 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1862,8 +1858,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
- if (sample_type & PERF_SAMPLE_WEIGHT)
- size += sizeof(data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
@@ -1880,6 +1876,12 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ size += sizeof(data->code_page_size);
+
event->header_size = size;
}
@@ -1889,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1921,23 +1924,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
@@ -1949,7 +1973,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -1965,6 +1990,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -1991,6 +2017,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -2012,6 +2040,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2028,13 +2057,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2063,7 +2090,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2114,12 +2141,25 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
+}
+
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+ event_sched_out(event, event->ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
@@ -2138,9 +2178,10 @@ static void perf_group_detach(struct perf_event *event)
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
+ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
@@ -2151,13 +2192,16 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
- if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
add_event_to_groups(sibling, event->ctx);
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
@@ -2168,58 +2212,53 @@ static void perf_group_detach(struct perf_event *event)
}
out:
- perf_event__header_size(event->group_leader);
-
- for_each_sibling_event(tmp, event->group_leader)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
-}
-static bool is_orphaned_event(struct perf_event *event)
-{
- return event->state == PERF_EVENT_STATE_DEAD;
+ perf_event__header_size(leader);
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
+static void sync_child_event(struct perf_event *child_event);
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
+static void perf_child_detach(struct perf_event *event)
{
- struct perf_event *sibling;
+ struct perf_event *parent_event = event->parent;
- if (!__pmu_filter_match(event))
- return 0;
+ if (!(event->attach_state & PERF_ATTACH_CHILD))
+ return;
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
- }
+ event->attach_state &= ~PERF_ATTACH_CHILD;
- return 1;
+ if (WARN_ON_ONCE(!parent_event))
+ return;
+
+ lockdep_assert_held(&parent_event->child_mutex);
+
+ sync_child_event(event);
+ list_del_init(&event->child_list);
+}
+
+static bool is_orphaned_event(struct perf_event *event)
+{
+ return event->state == PERF_EVENT_STATE_DEAD;
}
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2238,52 +2277,61 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap) {
+ bool dec = true;
+
+ event->pending_sigtrap = 0;
+ if (state != PERF_EVENT_STATE_OFF &&
+ !event->pending_work) {
+ event->pending_work = 1;
+ dec = false;
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ }
+ if (dec)
+ local_dec(&event->ctx->nr_pending);
+ }
+
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
-
- if (group_event->attr.exclusive)
- cpuctx->exclusive = 0;
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
+#define DETACH_CHILD 0x02UL
+#define DETACH_DEAD 0x04UL
/*
* Cross CPU call to remove a performance event
@@ -2297,21 +2345,46 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
}
- event_sched_out(event, cpuctx, ctx);
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_DEAD)
+ event->pending_disable = 1;
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
+ if (flags & DETACH_CHILD)
+ perf_child_detach(event);
list_del_event(event, ctx);
+ if (flags & DETACH_DEAD)
+ event->state = PERF_EVENT_STATE_DEAD;
+
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2335,25 +2408,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
lockdep_assert_held(&ctx->mutex);
- event_function_call(event, __perf_remove_from_context, (void *)flags);
-
/*
- * The above event_function_call() can NO-OP when it hits
- * TASK_TOMBSTONE. In that case we must already have been detached
- * from the context (by perf_event_exit_event()) but the grouping
- * might still be in-tact.
+ * Because of perf_event_exit_task(), perf_remove_from_context() ought
+ * to work in the face of TASK_TOMBSTONE, unlike every other
+ * event_function_call() user.
*/
- WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
- if ((flags & DETACH_GROUP) &&
- (event->attach_state & PERF_ATTACH_GROUP)) {
- /*
- * Since in that case we cannot possibly be scheduled, simply
- * detach now.
- */
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
+ raw_spin_lock_irq(&ctx->lock);
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
+ ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
+ return;
}
+ raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
@@ -2372,13 +2441,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2391,7 +2464,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_irq it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2430,43 +2503,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
- irq_work_queue(&event->pending);
-}
-
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2475,10 +2513,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2509,8 +2547,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -2521,14 +2557,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2537,29 +2571,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx)) {
- pmu->cancel_txn(pmu);
- perf_mux_hrtimer_restart(cpuctx);
- return -EAGAIN;
- }
+ if (event_sched_in(group_event, ctx))
+ goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2578,24 +2607,23 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
+error:
pmu->cancel_txn(pmu);
-
- perf_mux_hrtimer_restart(cpuctx);
-
return -EAGAIN;
}
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2605,13 +2633,13 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
* events on the CPU, it can't go on.
*/
- if (event->attr.exclusive && cpuctx->active_oncpu)
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
return 0;
/*
* Otherwise, try to add it if all previous groups were able
@@ -2627,38 +2655,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- struct task_struct *task)
+ struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2676,11 +2695,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedule entire perf_event_context while adding new
+ * event to the context or enabling existing event in the context. We can
+ * probably optimize it by rescheduling only affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2690,11 +2713,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx, false);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx, false);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2704,17 +2729,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx);
- perf_event_sched_in(cpuctx, task_ctx, current);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_enable(&cpuctx->ctx, false);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx, false);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2732,7 +2760,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2774,7 +2802,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2807,7 +2835,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2823,7 +2851,8 @@ perf_install_in_context(struct perf_event_context *ctx,
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
@@ -2919,7 +2948,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2928,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2937,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2964,6 +2993,7 @@ static void _perf_event_enable(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2975,8 +3005,16 @@ static void _perf_event_enable(struct perf_event *event)
* has gone back into error state, as distinct from the task having
* been scheduled away before the cross-call arrived.
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state == PERF_EVENT_STATE_ERROR) {
+ /*
+ * Detached SIBLING events cannot leave ERROR state.
+ */
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
+
event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, NULL);
@@ -3146,27 +3184,107 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
return err;
}
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
static int perf_event_modify_attr(struct perf_event *event,
struct perf_event_attr *attr)
{
+ int (*func)(struct perf_event *, struct perf_event_attr *);
+ struct perf_event *child;
+ int err;
+
if (event->attr.type != attr->type)
return -EINVAL;
switch (event->attr.type) {
case PERF_TYPE_BREAKPOINT:
- return perf_event_modify_breakpoint(event, attr);
+ func = perf_event_modify_breakpoint;
+ break;
default:
/* Place holder for future additions. */
return -EOPNOTSUPP;
}
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
+ err = func(event, attr);
+ if (err)
+ goto out;
+ list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
+ err = func(child, attr);
+ if (err)
+ goto out;
+ }
+out:
+ mutex_unlock(&event->child_mutex);
+ return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3180,16 +3298,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3203,32 +3311,31 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
}
- is_active ^= ctx->is_active; /* changed bits */
-
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
}
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
+ is_active ^= ctx->is_active; /* changed bits */
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
- perf_pmu_enable(ctx->pmu);
}
/*
@@ -3333,24 +3440,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
if (likely(!ctx))
return;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3374,21 +3525,28 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- struct pmu *pmu = ctx->pmu;
+
+ perf_ctx_disable(ctx, false);
+
+ /* PMIs are disabled; ctx->nr_pending is stable. */
+ if (local_read(&ctx->nr_pending) ||
+ local_read(&next_ctx->nr_pending)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
+
+ perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3397,8 +3555,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3412,31 +3570,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+ perf_ctx_disable(ctx, false);
+
+inside_switch:
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
+
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3448,38 +3615,44 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void perf_pmu_sched_task(struct task_struct *prev,
- struct task_struct *next,
- bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- if (prev == next)
+ pmu = cpc->epc.pmu;
+
+ /* software PMUs will not have sched_task */
+ if (WARN_ON_ONCE(!pmu->sched_task))
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpc->task_epc, sched_in);
- if (WARN_ON_ONCE(!pmu->sched_task))
- continue;
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(pmu);
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
+ return;
- perf_pmu_enable(pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3494,33 +3667,20 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
-}
-
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ perf_cgroup_switch(next);
}
static bool perf_less_group_idx(const void *l, const void *r)
@@ -3554,21 +3714,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3588,17 +3766,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3606,7 +3789,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3616,10 +3799,38 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3628,66 +3839,82 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
- }
+ } else {
+ struct perf_cpu_pmu_context *cpc;
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
+ group_update_userpage(event);
+ }
}
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
int can_add_hw = 1;
-
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
merge_sched_in, &can_add_hw);
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
- int can_add_hw = 1;
+ struct perf_event_pmu_context *pmu_ctx;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
+{
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
- u64 now;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (!(is_active & EVENT_TIME)) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3698,42 +3925,38 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
- ctx_sched_in(ctx, cpuctx, event_type, task);
-}
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx, false);
- cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
- return;
+ perf_ctx_sched_task_cb(ctx, true);
+
+ perf_ctx_enable(ctx, false);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
+ }
perf_ctx_lock(cpuctx, ctx);
/*
@@ -3743,7 +3966,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(ctx->pmu);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3752,13 +3975,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx, false);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
+ perf_event_sched_in(cpuctx, ctx);
+
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
+
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
- perf_pmu_enable(ctx->pmu);
+ perf_ctx_enable(&cpuctx->ctx, false);
+
+ perf_ctx_enable(ctx, false);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3775,26 +4009,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- /*
- * If cgroup events exist on this CPU, then we need to check if we have
- * to switch in PMU state; cgroup event are system-wide mode only.
- *
- * Since cgroup events are CPU events, we must schedule these in before
- * we schedule in the task events.
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -3913,8 +4128,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -3926,16 +4141,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -3976,7 +4191,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -3998,72 +4212,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
- perf_event_sched_in(cpuctx, task_ctx, current);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
+
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4071,8 +4322,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4081,8 +4332,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4104,9 +4360,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4114,13 +4370,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_events)
+ goto out;
+
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4133,7 +4392,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4144,6 +4403,50 @@ out:
put_ctx(clone_ctx);
}
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
+{
+ struct perf_event_context *clone_ctx = NULL;
+ struct perf_event *event, *next;
+ unsigned long flags;
+ bool modified = false;
+
+ mutex_lock(&ctx->mutex);
+
+ if (WARN_ON_ONCE(ctx->task != current))
+ goto unlock;
+
+ list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+ if (!event->attr.remove_on_exec)
+ continue;
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
+
+ modified = true;
+
+ perf_event_exit_event(event, ctx);
+ }
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ if (modified)
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+ mutex_unlock(&ctx->mutex);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
+}
+
struct perf_read_data {
struct perf_event *event;
bool group;
@@ -4154,6 +4457,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
u16 local_pkg, event_pkg;
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
int local_cpu = smp_processor_id();
@@ -4175,7 +4481,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4232,6 +4538,18 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4244,6 +4562,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
u64 *enabled, u64 *running)
{
unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
int ret = 0;
/*
@@ -4268,15 +4588,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
/* If this is a per-CPU event, it must be for this CPU */
if (!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id()) {
+ event_cpu != smp_processor_id()) {
ret = -EINVAL;
goto out;
}
/* If this is a pinned event it must be running on this CPU */
- if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
ret = -EBUSY;
goto out;
}
@@ -4286,15 +4613,14 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event_oncpu == smp_processor_id())
event->pmu->read(event);
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -4390,17 +4716,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4411,7 +4745,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4440,15 +4773,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4456,52 +4786,33 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx;
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4510,12 +4821,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4528,26 +4839,150 @@ retry:
}
}
- kfree(task_ctx_data);
return ctx;
errout:
- kfree(task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ /*
+ * perf_pmu_migrate_context() / __perf_pmu_install_event()
+ * relies on the fact that find_get_pmu_context() cannot fail
+ * for CPU contexts.
+ */
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+ raw_spin_lock_irq(&ctx->lock);
+ if (!epc->ctx) {
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new;
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) {
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ struct perf_event_context *ctx = epc->ctx;
+ unsigned long flags;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
+ return;
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (epc->embedded)
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
- kfree(event);
+ kmem_cache_free(perf_event_cache, event);
}
static void ring_buffer_attach(struct perf_event *event,
@@ -4575,7 +5010,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
attr->task || attr->ksymbol ||
- attr->context_switch ||
+ attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
return false;
@@ -4587,15 +5022,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
detach_sb_event(event);
}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
@@ -4625,10 +5051,12 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_dec(&nr_build_id_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
@@ -4651,14 +5079,14 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
schedule_delayed_work(&perf_sched_work, HZ);
}
- unaccount_event_cpu(event, event->cpu);
-
unaccount_pmu_sb_event(event);
}
@@ -4677,7 +5105,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -4761,7 +5189,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
static void _free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ irq_work_sync(&event->pending_irq);
unaccount_event(event);
@@ -4801,6 +5229,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -4901,8 +5332,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -4915,9 +5346,7 @@ int perf_event_release_kernel(struct perf_event *event)
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
- raw_spin_lock_irq(&ctx->lock);
/*
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
@@ -4929,8 +5358,7 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
perf_event_ctx_unlock(event, ctx);
@@ -5061,7 +5489,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -5071,6 +5499,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -5093,15 +5548,20 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -5120,10 +5580,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
@@ -5154,7 +5610,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5164,6 +5620,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -5331,7 +5789,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5347,7 +5805,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -5410,7 +5868,6 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
@@ -5473,7 +5930,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_filter(event, (void __user *)arg);
case PERF_EVENT_IOC_SET_BPF:
- return perf_event_set_bpf_prog(event, arg);
+ {
+ struct bpf_prog *prog;
+ int err;
+
+ prog = bpf_prog_get(arg);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = perf_event_set_bpf_prog(event, prog, 0);
+ if (err) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+ }
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
struct perf_buffer *rb;
@@ -5597,18 +6069,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -5733,6 +6193,8 @@ static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -5790,6 +6252,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -5803,6 +6268,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -5851,11 +6319,11 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5886,7 +6354,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (atomic_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
@@ -5895,7 +6364,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
@@ -6068,17 +6537,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
+ if (data_page_nr(event->rb) != nr_pages) {
ret = -EINVAL;
goto unlock;
}
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -6146,6 +6615,7 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6171,7 +6641,7 @@ aux_unlock:
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
@@ -6232,16 +6702,54 @@ void perf_event_wakeup(struct perf_event *event)
}
}
-static void perf_pending_event_disable(struct perf_event *event)
+static void perf_sigtrap(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ /*
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
+ */
+ if (WARN_ON_ONCE(event->ctx->task != current))
+ return;
+ /*
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
+ */
+ if (current->flags & PF_EXITING)
+ return;
+
+ send_sig_perf((void __user *)event->pending_addr,
+ event->orig_type, event->attr.sig_data);
+}
+
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->oncpu);
+
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
if (cpu < 0)
return;
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
- perf_event_disable_local(event);
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_pending);
+ }
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
+ }
return;
}
@@ -6261,55 +6769,101 @@ static void perf_pending_event_disable(struct perf_event *event)
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_event()
+ * perf_pending_irq()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending, cpu);
+ irq_work_queue_on(&event->pending_irq, cpu);
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry, struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+ __perf_pending_irq(event);
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_pending);
+ }
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
+
+ put_event(event);
+}
+
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
@@ -6328,14 +6882,13 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
- perf_get_regs_user(regs_user, regs, regs_user_copy);
+ perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
@@ -6418,7 +6971,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6436,10 +6988,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6467,7 +7016,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
goto out;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
goto out;
@@ -6487,10 +7036,10 @@ out:
return data->aux_size;
}
-long perf_pmu_snapshot_aux(struct perf_buffer *rb,
- struct perf_event *event,
- struct perf_output_handle *handle,
- unsigned long size)
+static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
{
unsigned long flags;
long ret;
@@ -6533,7 +7082,7 @@ static void perf_aux_sample_output(struct perf_event *event,
if (WARN_ON_ONCE(!sampler || !data->aux_size))
return;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
return;
@@ -6565,14 +7114,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event)
-{
- u64 sample_type = event->attr.sample_type;
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
- data->type = sample_type;
- header->size += event->id_header_size;
+static void __perf_event_header__init_id(struct perf_sample_data *data,
+ struct perf_event *event,
+ u64 sample_type)
+{
+ data->type = event->attr.sample_type;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -6599,8 +7154,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6640,7 +7197,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6654,6 +7211,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6664,9 +7223,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ unsigned long flags;
+ u64 values[6];
int n = 0;
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6682,6 +7248,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6695,9 +7263,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
@@ -6734,11 +7306,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -6827,9 +7394,17 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
+ /*
+ * Add the extension space which is appended
+ * right after the struct perf_branch_stack.
+ */
+ if (data->br_stack_cntr) {
+ size = data->br_stack->nr * sizeof(u64);
+ perf_output_copy(handle, data->br_stack_cntr, size);
+ }
} else {
/*
* we always store at least the value of nr
@@ -6862,8 +7437,8 @@ void perf_output_sample(struct perf_output_handle *handle,
data->regs_user.regs);
}
- if (sample_type & PERF_SAMPLE_WEIGHT)
- perf_output_put(handle, data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ perf_output_put(handle, data->weight.full);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
@@ -6894,6 +7469,12 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ perf_output_put(handle, data->code_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
@@ -6919,7 +7500,6 @@ void perf_output_sample(struct perf_output_handle *handle,
static u64 perf_virt_to_phys(u64 virt)
{
u64 phys_addr = 0;
- struct page *p = NULL;
if (!virt)
return 0;
@@ -6938,19 +7518,111 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
+ struct page *p;
+
pagefault_disable();
- if (get_user_page_fast_only(virt, 0, &p))
+ if (get_user_page_fast_only(virt, 0, &p)) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
pagefault_enable();
}
-
- if (p)
- put_page(p);
}
return phys_addr;
}
+/*
+ * Return the pagetable size of a given virtual address.
+ */
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
+{
+ u64 size = 0;
+
+#ifdef CONFIG_HAVE_FAST_GUP
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ return 0;
+
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
+
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
+ return 0;
+
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
+
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
+ return 0;
+
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
+
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+again:
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
+
+ ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ goto again;
+
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+ return size;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = perf_get_pgtable_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
@@ -6971,77 +7643,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
+ /*
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
+ */
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
+ }
- __perf_event_header__init_id(header, data, event);
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- if (sample_type & PERF_SAMPLE_IP)
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
-
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
- data->callchain = perf_callchain(event, regs);
-
- size += data->callchain->nr;
-
- header->size += size * sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_IP;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
-
- if (raw) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
-
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
-
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- }
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs,
- &data->regs_user_copy);
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
+ perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7050,10 +7713,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7061,9 +7725,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7075,10 +7740,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7090,26 +7776,46 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
- if (sample_type & PERF_SAMPLE_AUX) {
+ /*
+ * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
+ * but the value will not dump to the userspace.
+ */
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
+ data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
+ data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7117,14 +7823,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7141,6 +7859,7 @@ __perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
int (*output_begin)(struct perf_output_handle *,
+ struct perf_sample_data *,
struct perf_event *,
unsigned int))
{
@@ -7151,9 +7870,10 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
- err = output_begin(&handle, event, header.size);
+ err = output_begin(&handle, data, event, header.size);
if (err)
goto exit;
@@ -7219,7 +7939,7 @@ perf_event_read_event(struct perf_event *event,
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
- ret = perf_output_begin(&handle, event, read_event.header.size);
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
if (ret)
return;
@@ -7284,7 +8004,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7301,11 +8020,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7347,20 +8064,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- perf_event_enable_on_exec(ctxn);
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
- true);
- }
- rcu_read_unlock();
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7400,8 +8114,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -7488,7 +8201,7 @@ static void perf_event_task_output(struct perf_event *event,
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
task_event->event_id.header.size);
if (ret)
goto out;
@@ -7591,7 +8304,7 @@ static void perf_event_comm_output(struct perf_event *event,
return;
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
comm_event->event_id.header.size);
if (ret)
@@ -7617,7 +8330,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -7691,7 +8404,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
perf_event_header__init_id(&namespaces_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
namespaces_event->event_id.header.size);
if (ret)
goto out;
@@ -7818,7 +8531,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data)
perf_event_header__init_id(&cgroup_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
cgroup_event->event_id.header.size);
if (ret)
goto out;
@@ -7897,6 +8610,8 @@ struct perf_mmap_event {
u64 ino;
u64 ino_generation;
u32 prot, flags;
+ u8 build_id[BUILD_ID_SIZE_MAX];
+ u32 build_id_size;
struct {
struct perf_event_header header;
@@ -7928,6 +8643,7 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
u32 type = mmap_event->event_id.header.type;
+ bool use_build_id;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -7944,7 +8660,7 @@ static void perf_event_mmap_output(struct perf_event *event,
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
mmap_event->event_id.header.size);
if (ret)
goto out;
@@ -7952,13 +8668,25 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
+ use_build_id = event->attr.build_id && mmap_event->build_id_size;
+
+ if (event->attr.mmap2 && use_build_id)
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
+
perf_output_put(&handle, mmap_event->event_id);
if (event->attr.mmap2) {
- perf_output_put(&handle, mmap_event->maj);
- perf_output_put(&handle, mmap_event->min);
- perf_output_put(&handle, mmap_event->ino);
- perf_output_put(&handle, mmap_event->ino_generation);
+ if (use_build_id) {
+ u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
+
+ __output_copy(&handle, size, 4);
+ __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
+ } else {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ }
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
@@ -7984,7 +8712,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
@@ -7998,10 +8726,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
else
flags = MAP_PRIVATE;
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (is_vm_hugetlb_page(vma))
@@ -8035,33 +8759,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
- }
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name, sizeof(tmp));
name = tmp;
got_name:
/*
@@ -8087,6 +8800,9 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+ if (atomic_read(&nr_build_id_events))
+ build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -8176,7 +8892,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8186,13 +8901,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -8254,7 +8965,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
int ret;
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -8288,7 +8999,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
perf_event_header__init_id(&lost_samples_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
lost_samples_event.header.size);
if (ret)
return;
@@ -8343,7 +9054,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data)
perf_event_header__init_id(&se->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
if (ret)
return;
@@ -8378,13 +9089,12 @@ static void perf_event_switch(struct task_struct *task,
},
};
- if (!sched_in && task->state == TASK_RUNNING)
+ if (!sched_in && task->on_rq) {
switch_event.event_id.header.misc |=
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+ }
- perf_iterate_sb(perf_event_switch_output,
- &switch_event,
- NULL);
+ perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}
/*
@@ -8418,7 +9128,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_event_header__init_id(&throttle_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
throttle_event.header.size);
if (ret)
return;
@@ -8461,7 +9171,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data)
perf_event_header__init_id(&ksymbol_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
ksymbol_event->event_id.header.size);
if (ret)
return;
@@ -8488,7 +9198,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strlcpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym, KSYM_NAME_LEN);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -8551,7 +9261,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -8581,7 +9291,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)subprog->bpf_func,
subprog->jited_len, unregister,
- prog->aux->ksym.name);
+ subprog->aux->ksym.name);
}
}
}
@@ -8628,6 +9338,90 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -8658,7 +9452,7 @@ static void perf_log_itrace_start(struct perf_event *event)
rec.tid = perf_event_tid(event, current);
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -8669,6 +9463,37 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
+void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 hw_id;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.hw_id = hw_id;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
+
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
@@ -8682,8 +9507,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -8710,13 +9535,26 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
int ret = 0;
@@ -8739,23 +9577,59 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
-
perf_event_disable_inatomic(event);
}
+ if (event->attr.sigtrap) {
+ /*
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
+ */
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
+ if (!event->pending_sigtrap) {
+ event->pending_sigtrap = pending_id;
+ local_inc(&event->ctx->nr_pending);
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_sigtrap; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ }
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending_irq);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -8791,16 +9665,16 @@ u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
+ old = local64_read(&hwc->period_left);
+ do {
+ val = old;
+ if (val < 0)
+ return 0;
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
return nr;
}
@@ -9151,6 +10025,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}
+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -9166,7 +10043,10 @@ static int perf_swevent_init(struct perf_event *event)
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;
default:
@@ -9205,6 +10085,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9254,6 +10172,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9269,13 +10225,25 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
};
perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
+ perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &data, regs)) {
perf_swevent_event(event, count, &data, regs);
+
+ /*
+ * Here use the same on-stack perf_sample_data,
+ * some members in data are event-specific and
+ * need to be re-computed for different sweveents.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that next event skips preparing data
+ * because data->sample_flags is set.
+ */
+ perf_sample_data_init(&data, 0, 0);
+ perf_sample_save_raw_data(&data, &raw);
+ }
}
/*
@@ -9284,23 +10252,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9309,44 +10269,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -9513,13 +10435,18 @@ static void bpf_overflow_handler(struct perf_event *event,
.data = data,
.event = event,
};
+ struct bpf_prog *prog;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
- ret = BPF_PROG_RUN(event->prog, &ctx);
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -9529,10 +10456,10 @@ out:
event->orig_overflow_handler(event, data, regs);
}
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- struct bpf_prog *prog;
-
if (event->overflow_handler_context)
/* hw breakpoint or kernel counter */
return -EINVAL;
@@ -9540,11 +10467,28 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
if (event->prog)
return -EEXIST;
- prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
return 0;
@@ -9562,7 +10506,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
bpf_prog_put(prog);
}
#else
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -EOPNOTSUPP;
}
@@ -9590,57 +10536,46 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- bool is_kprobe, is_tracepoint, is_syscall_tp;
- struct bpf_prog *prog;
- int ret;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
if (!perf_event_is_tracing(event))
- return perf_event_set_bpf_handler(event, prog_fd);
+ return perf_event_set_bpf_handler(event, prog, bpf_cookie);
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
- (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
+ return -EINVAL;
+
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
return -EINVAL;
- }
/* Kprobe override only works for kprobes, not uprobes. */
- if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
- bpf_prog_put(prog);
+ if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
- }
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
- if (prog->aux->max_ctx_offset > off) {
- bpf_prog_put(prog);
+ if (prog->aux->max_ctx_offset > off)
return -EACCES;
- }
}
- ret = perf_event_attach_bpf_prog(event, prog);
- if (ret)
- bpf_prog_put(prog);
- return ret;
+ return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
@@ -9659,12 +10594,13 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -ENOENT;
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */
@@ -9749,8 +10685,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
@@ -9780,7 +10717,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
return;
if (ifh->nr_file_filters) {
- mm = get_task_mm(event->ctx->task);
+ mm = get_task_mm(task);
if (!mm)
goto restart;
@@ -9917,7 +10854,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
case IF_SRC_KERNELADDR:
case IF_SRC_KERNEL:
kernel = 1;
- /* fall through */
+ fallthrough;
case IF_SRC_FILEADDR:
case IF_SRC_FILE:
@@ -9939,6 +10876,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
int fpos = token == IF_SRC_FILE ? 2 : 1;
+ kfree(filename);
filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
@@ -9960,8 +10898,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -9985,16 +10921,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
ret = -EOPNOTSUPP;
if (!event->ctx->task)
- goto fail_free_name;
+ goto fail;
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW,
&filter->path);
if (ret)
- goto fail_free_name;
-
- kfree(filename);
- filename = NULL;
+ goto fail;
ret = -EINVAL;
if (!filter->path.dentry ||
@@ -10006,21 +10939,24 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
if (state != IF_STATE_ACTION)
goto fail;
+ kfree(filename);
kfree(orig);
return 0;
-fail_free_name:
- kfree(filename);
fail:
+ kfree(filename);
free_filters_list(filters);
kfree(orig);
@@ -10242,7 +11178,7 @@ static void cpu_clock_event_read(struct perf_event *event)
static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -10263,6 +11199,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -10323,7 +11260,7 @@ static void task_clock_event_read(struct perf_event *event)
static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -10344,6 +11281,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -10413,36 +11351,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -10454,7 +11365,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10465,7 +11376,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10476,7 +11387,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -10506,12 +11417,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -10523,9 +11433,30 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
@@ -10548,29 +11479,25 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
+ pmu->dev->parent = pmu->parent;
pmu->dev->release = pmu_dev_release;
- ret = device_add(pmu->dev);
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
goto free_dev;
- /* For PMUs with address filters, throw in an extra attribute: */
- if (pmu->nr_addr_filters)
- ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
+ ret = device_add(pmu->dev);
if (ret)
- goto del_dev;
+ goto free_dev;
- if (pmu->attr_update)
+ if (pmu->attr_update) {
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
- if (ret)
- goto del_dev;
+ if (ret)
+ goto del_dev;
+ }
out:
return ret;
@@ -10597,72 +11524,44 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto unlock;
pmu->type = -1;
- if (!name)
- goto skip_type;
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
- if (type != PERF_TYPE_SOFTWARE) {
- if (type >= 0)
- max = type;
+ if (type >= 0)
+ max = type;
- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto free_pdc;
- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && ret != type);
- type = ret;
- }
+ type = ret;
pmu->type = type;
- if (pmu_bus_running) {
+ if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}
-skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
- __perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -10691,16 +11590,7 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- /*
- * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
- * since these cannot be in the IDR. This way the linear search
- * is fast, provided a valid software event is provided.
- */
- if (type == PERF_TYPE_SOFTWARE || !name)
- list_add_rcu(&pmu->entry, &pmus);
- else
- list_add_tail_rcu(&pmu->entry, &pmus);
-
+ list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -10709,12 +11599,13 @@ unlock:
return ret;
free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
free_idr:
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
+ idr_remove(&pmu_idr, pmu->type);
free_pdc:
free_percpu(pmu->pmu_disable_count);
@@ -10735,9 +11626,8 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
- if (pmu_bus_running) {
+ idr_remove(&pmu_idr, pmu->type);
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
@@ -10805,11 +11695,18 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
+ bool extended_type = false;
int idx, type, ret;
struct pmu *pmu;
idx = srcu_read_lock(&pmus_srcu);
+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
+
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
@@ -10823,16 +11720,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
* are often aliases for PERF_TYPE_RAW.
*/
type = event->attr.type;
- if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
- type = PERF_TYPE_RAW;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
+ type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
+ if (!type) {
+ type = PERF_TYPE_RAW;
+ } else {
+ extended_type = true;
+ event->attr.config &= PERF_HW_EVENT_MASK;
+ }
+ }
again:
rcu_read_lock();
pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
+ if (event->attr.type != type && type != PERF_TYPE_RAW &&
+ !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+ goto fail;
+
ret = perf_try_init_event(pmu, event);
- if (ret == -ENOENT && event->attr.type != type) {
+ if (ret == -ENOENT && event->attr.type != type && !extended_type) {
type = event->attr.type;
goto again;
}
@@ -10853,6 +11761,7 @@ again:
goto unlock;
}
}
+fail:
pmu = ERR_PTR(-ENOENT);
unlock:
srcu_read_unlock(&pmus_srcu, idx);
@@ -10882,15 +11791,6 @@ static void account_pmu_sb_event(struct perf_event *event)
attach_sb_event(event);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
@@ -10919,10 +11819,12 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_inc(&nr_build_id_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
@@ -10945,6 +11847,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
if (inc) {
/*
@@ -10974,8 +11878,6 @@ static void account_event(struct perf_event *event)
}
enabled:
- account_event_cpu(event, event->cpu);
-
account_pmu_sb_event(event);
}
@@ -10994,13 +11896,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
+ int node;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
return ERR_PTR(-EINVAL);
}
+ if (attr->sigtrap && !task) {
+ /* Requires a task: avoid signalling random tasks. */
+ return ERR_PTR(-EINVAL);
+ }
- event = kzalloc(sizeof(*event), GFP_KERNEL);
+ node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
+ event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -11025,8 +11934,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11045,6 +11954,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
+
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
@@ -11114,10 +12026,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -11200,11 +12113,9 @@ err_pmu:
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
if (event->hw.target)
put_task_struct(event->hw.target);
- kfree(event);
+ call_rcu(&event->rcu_head, free_event_rcu);
return ERR_PTR(err);
}
@@ -11313,6 +12224,18 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_CGROUP)
return -EINVAL;
#endif
+ if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
+ (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+ return -EINVAL;
+
+ if (!attr->inherit && attr->inherit_thread)
+ return -EINVAL;
+
+ if (attr->remove_on_exec && attr->enable_on_exec)
+ return -EINVAL;
+
+ if (attr->sigtrap && !attr->remove_on_exec)
+ return -EINVAL;
out:
return ret;
@@ -11323,14 +12246,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11345,7 +12279,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -11368,8 +12302,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11379,6 +12320,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11386,20 +12333,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
@@ -11437,35 +12377,35 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
- struct perf_event_context *gctx;
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
}
- return gctx;
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
}
/**
@@ -11475,15 +12415,17 @@ again:
* @pid: target pid
* @cpu: target cpu
* @group_fd: group leader event fd
+ * @flags: perf event open flags
*/
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -11498,12 +12440,12 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
- /* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
- err = perf_copy_attr(attr_uptr, &attr);
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
if (err)
return err;
@@ -11533,12 +12475,12 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
}
- err = security_locked_down(LOCKDOWN_PERF);
- if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
- /* REGS_INTR can leak data, lockdown must prevent this */
- return err;
-
- err = 0;
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
+ err = security_locked_down(LOCKDOWN_PERF);
+ if (err)
+ return err;
+ }
/*
* In cgroup mode, the pid argument is used to pass the fd
@@ -11581,24 +12523,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- if (task) {
- err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
- if (err)
- goto err_task;
-
- /*
- * Reuse ptrace permission checks for now.
- *
- * We must hold exec_update_mutex across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto err_cred;
- }
-
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -11606,7 +12530,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cred;
+ goto err_task;
}
if (is_sampling_event(event)) {
@@ -11631,42 +12555,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -11675,11 +12610,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -11687,115 +12622,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
-
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
}
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -11804,36 +12700,33 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -11844,9 +12737,10 @@ SYSCALL_DEFINE5(perf_event_open,
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -11854,9 +12748,10 @@ SYSCALL_DEFINE5(perf_event_open,
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -11873,12 +12768,10 @@ SYSCALL_DEFINE5(perf_event_open,
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
put_task_struct(task);
}
@@ -11896,25 +12789,18 @@ SYSCALL_DEFINE5(perf_event_open,
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
-/* err_file: */
- fput(event_file);
-err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
-err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
err_cred:
if (task)
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
+err_alloc:
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -11931,6 +12817,8 @@ err_fd:
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @overflow_handler: callback to trigger when we hit the event
+ * @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
@@ -11938,8 +12826,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -11958,14 +12848,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -11975,6 +12869,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -11986,13 +12887,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12001,44 +12902,67 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
+
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+ struct perf_event_context *old_ctx = event->ctx;
+
+ get_ctx(ctx); /* normally find_get_context() */
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(ctx, event, cpu);
/*
- * Wait for the events to quiesce before re-instating them.
+ * Now that event->ctx is updated and visible, put the old ctx.
*/
- synchronize_rcu();
+ put_ctx(old_ctx);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12048,43 +12972,70 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
+ }
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ /*
+ * Since per-cpu context is persistent, no need to grab an extra
+ * reference.
+ */
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
}
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
-static void sync_child_event(struct perf_event *child_event,
- struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
- if (child_event->attr.inherit_stat)
- perf_event_read_event(child_event, child);
+ if (child_event->attr.inherit_stat) {
+ struct task_struct *task = child_event->ctx->task;
+
+ if (task && task != TASK_TOMBSTONE)
+ perf_event_read_event(child_event, task);
+ }
child_val = perf_event_count(child_event);
@@ -12099,70 +13050,63 @@ static void sync_child_event(struct perf_event *child_event,
}
static void
-perf_event_exit_event(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *parent_event = child_event->parent;
+ struct perf_event *parent_event = event->parent;
+ unsigned long detach_flags = 0;
- /*
- * Do not destroy the 'original' grouping; because of the context
- * switch optimization the original events could've ended up in a
- * random child task.
- *
- * If we were to destroy the original group, all group related
- * operations would cease to function properly after this random
- * child dies.
- *
- * Do destroy all inherited groups, we don't care about those
- * and being thorough is better.
- */
- raw_spin_lock_irq(&child_ctx->lock);
- WARN_ON_ONCE(child_ctx->is_active);
+ if (parent_event) {
+ /*
+ * Do not destroy the 'original' grouping; because of the
+ * context switch optimization the original events could've
+ * ended up in a random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this
+ * random child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ detach_flags = DETACH_GROUP | DETACH_CHILD;
+ mutex_lock(&parent_event->child_mutex);
+ }
- if (parent_event)
- perf_group_detach(child_event);
- list_del_event(child_event, child_ctx);
- perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
- raw_spin_unlock_irq(&child_ctx->lock);
+ perf_remove_from_context(event, detach_flags);
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state > PERF_EVENT_STATE_EXIT)
+ perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
+ raw_spin_unlock_irq(&ctx->lock);
/*
- * Parent events are governed by their filedesc, retain them.
+ * Child events can be freed.
*/
- if (!parent_event) {
- perf_event_wakeup(child_event);
+ if (parent_event) {
+ mutex_unlock(&parent_event->child_mutex);
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ free_event(event);
+ put_event(parent_event);
return;
}
- /*
- * Child events can be cleaned up.
- */
-
- sync_child_event(child_event, child);
/*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Kick perf_poll() for is_event_hup().
+ * Parent events are governed by their filedesc, retain them.
*/
- perf_event_wakeup(parent_event);
- free_event(child_event);
- put_event(parent_event);
+ perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12184,13 +13128,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12209,7 +13153,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx);
mutex_unlock(&child_ctx->mutex);
@@ -12219,13 +13163,12 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
*
- * Can be called with exec_update_mutex held when called from
+ * Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12241,8 +13184,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -12285,56 +13227,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -12384,6 +13321,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -12404,18 +13342,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
- GFP_KERNEL);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return ERR_CAST(pmu_ctx);
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -12470,6 +13402,7 @@ inherit_event(struct perf_event *parent_event,
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
add_event_to_ctx(child_event, child_ctx);
+ child_event->attach_state |= PERF_ATTACH_CHILD;
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
@@ -12520,6 +13453,8 @@ static int inherit_group(struct perf_event *parent_event,
!perf_get_aux_event(child_ctr, leader))
return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
@@ -12537,18 +13472,21 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
- int *inherited_all)
+ struct task_struct *child,
+ u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
- if (!event->attr.inherit) {
+ if (!event->attr.inherit ||
+ (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+ /* Do not inherit if sigtrap and signal handlers were cleared. */
+ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
*inherited_all = 0;
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -12556,16 +13494,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -12575,7 +13511,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -12585,14 +13521,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -12615,7 +13551,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -12631,7 +13567,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -12639,7 +13575,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -12673,20 +13609,18 @@ out_unlock:
/*
* Initialize the perf_event context in task_struct
*/
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -12695,6 +13629,7 @@ int perf_event_init_task(struct task_struct *child)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -12702,15 +13637,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -12732,12 +13671,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -12747,18 +13686,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -12772,20 +13709,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -12826,8 +13760,8 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
@@ -12835,6 +13769,8 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+ perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
+
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
@@ -12868,7 +13804,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;
ret = pmu_dev_alloc(pmu);
@@ -12920,9 +13856,11 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
- rcu_read_unlock();
+
+ preempt_disable();
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}
@@ -12949,3 +13887,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b48d7039a015..6c2cb4e4f48d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
+
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
+
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
-
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,95 +396,140 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
- if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
-}
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
-__weak int arch_reserve_bp_slot(struct perf_event *bp)
-{
- return 0;
-}
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
-__weak void arch_release_bp_slot(struct perf_event *bp)
-{
-}
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
+ }
-/*
- * Function to perform processor-specific cleanup during unregistration
- */
-__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
-{
/*
- * A weak stub function here for those archs that don't define
- * it inside arch/.../kernel/hw_breakpoint.c
+ * Readers want a stable snapshot of the per-task breakpoint list.
*/
+ assert_bp_constraints_lock_held(bp);
+
+ if (enable)
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,10 +571,9 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
- int ret;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
@@ -293,36 +587,20 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
- ret = arch_reserve_bp_slot(bp);
- if (ret)
- return ret;
-
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -331,21 +609,17 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int weight;
- arch_release_bp_slot(bp);
-
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
- arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +646,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +660,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -451,6 +734,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
* register_user_hw_breakpoint - register a hardware breakpoint for user space
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
* @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
struct perf_event *
@@ -550,6 +834,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
* register_wide_hw_breakpoint - register a wide breakpoint in the kernel
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
*
* @return a set of per_cpu pointers to perf events
*/
@@ -566,7 +851,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
if (!cpu_events)
return (void __percpu __force *)ERR_PTR(-ENOMEM);
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
@@ -577,7 +862,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
per_cpu(*cpu_events, cpu) = bp;
}
- put_online_cpus();
+ cpus_read_unlock();
if (likely(!err))
return cpu_events;
@@ -602,6 +887,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -676,38 +1005,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..2cfeeecf8de9
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ kunit_skip(test, "not enough cpus");
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ kunit_skip(test, "hw breakpoint already in use");
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index fcbf5616a441..5150d5f84c03 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb)
}
#endif
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
@@ -205,16 +210,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
static inline int get_recursion_context(int *recursion)
{
- int rctx;
-
- if (unlikely(in_nmi()))
- rctx = 3;
- else if (in_irq())
- rctx = 2;
- else if (in_softirq())
- rctx = 1;
- else
- rctx = 0;
+ unsigned char rctx = interrupt_context_level();
if (recursion[rctx])
return -1;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 192b8abc6330..60ed43d1c29e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+ irq_work_queue(&handle->event->pending_irq);
}
/*
@@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size,
bool backward)
{
@@ -171,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -188,9 +191,10 @@ __perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
+ offset = local_read(&rb->head);
do {
+ head = offset;
tail = READ_ONCE(rb->user_page->data_tail);
- offset = head = local_read(&rb->head);
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
@@ -214,7 +218,7 @@ __perf_output_begin(struct perf_output_handle *handle,
head += size;
else
head -= size;
- } while (local_cmpxchg(&rb->head, offset, head) != offset);
+ } while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
@@ -237,24 +241,23 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
- struct perf_sample_data sample_data;
-
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
- perf_event_header__init_id(&lost_event.header,
- &sample_data, event);
+ /* XXX mostly redundant; @data is already fully initializes */
+ perf_event_header__init_id(&lost_event.header, data, event);
perf_output_put(handle, lost_event);
- perf_event__output_id_sample(event, handle, &sample_data);
+ perf_event__output_id_sample(event, handle, data);
}
return 0;
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -263,22 +266,25 @@ out:
}
int perf_output_begin_forward(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size, false);
+ return __perf_output_begin(handle, data, event, size, false);
}
int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size, true);
+ return __perf_output_begin(handle, data, event, size, true);
}
int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size,
+ return __perf_output_begin(handle, data, event, size,
unlikely(is_write_backward(event)));
}
@@ -604,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
- if (order > MAX_ORDER)
- order = MAX_ORDER;
+ if (order > MAX_PAGE_ORDER)
+ order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -672,23 +678,34 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
if (!has_aux(event))
return -EOPNOTSUPP;
- /*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
- */
- max_order = ilog2(nr_pages);
-
- /*
- * PMU requests more than one contiguous chunks of memory
- * for SW double buffering
- */
if (!overwrite) {
- if (!max_order)
- return -EINVAL;
+ /*
+ * Watermark defaults to half the buffer, and so does the
+ * max_order, to aid PMU drivers in double buffering.
+ */
+ if (!watermark)
+ watermark = nr_pages << (PAGE_SHIFT - 1);
- max_order--;
+ /*
+ * Use aux_watermark as the basis for chunking to
+ * help PMU drivers honor the watermark.
+ */
+ max_order = get_order(watermark);
+ } else {
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
+ watermark = 0;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
@@ -741,9 +758,6 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
- if (!rb->aux_watermark && !rb->aux_overwrite)
- rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
-
out:
if (!ret)
rb->aux_pgoff = pgoff;
@@ -802,15 +816,16 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct perf_buffer *rb;
unsigned long size;
- int i;
+ int i, node;
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
goto fail;
- rb = kzalloc(size, GFP_KERNEL);
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
@@ -854,11 +869,6 @@ void rb_free(struct perf_buffer *rb)
}
#else
-static int data_page_nr(struct perf_buffer *rb)
-{
- return rb->nr_pages << page_order(rb);
-}
-
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
@@ -904,11 +914,13 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
struct perf_buffer *rb;
unsigned long size;
void *all_buf;
+ int node;
size = sizeof(struct perf_buffer);
size += sizeof(void *);
- rb = kzalloc(size, GFP_KERNEL);
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5f8b0c52fd2e..929e98c62965 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,10 +19,9 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
@@ -154,26 +153,25 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -182,37 +180,34 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ folio_get(new_folio);
+ folio_add_new_anon_rmap(new_folio, vma, addr);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
+ ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ folio_remove_rmap_pte(old_folio, old_page, vma);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
-
- if (vma->vm_flags & VM_LOCKED)
- munlock_vma_page(old_page);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -355,9 +350,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -369,15 +365,14 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
- struct vm_area_struct *vma;
int ret;
short *ptr;
if (!vaddr || !d)
return -EINVAL;
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_WRITE, &page, &vma, NULL);
+ ret = get_user_pages_remote(mm, vaddr, 1,
+ FOLL_WRITE, &page, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
@@ -453,6 +448,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* that have fixed length instructions.
*
* uprobe_write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -477,10 +473,9 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
- &old_page, &vma, NULL);
- if (ret <= 0)
- return ret;
+ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(old_page))
+ return PTR_ERR(old_page);
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
@@ -542,7 +537,7 @@ retry:
}
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
+ ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
@@ -557,7 +552,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -613,41 +608,56 @@ static void put_uprobe(struct uprobe *uprobe)
}
}
-static int match_uprobe(struct uprobe *l, struct uprobe *r)
+static __always_inline
+int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
+ const struct uprobe *r)
{
- if (l->inode < r->inode)
+ if (l_inode < r->inode)
return -1;
- if (l->inode > r->inode)
+ if (l_inode > r->inode)
return 1;
- if (l->offset < r->offset)
+ if (l_offset < r->offset)
return -1;
- if (l->offset > r->offset)
+ if (l_offset > r->offset)
return 1;
return 0;
}
+#define __node_2_uprobe(node) \
+ rb_entry((node), struct uprobe, rb_node)
+
+struct __uprobe_key {
+ struct inode *inode;
+ loff_t offset;
+};
+
+static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
+{
+ const struct __uprobe_key *a = key;
+ return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
+}
+
+static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct uprobe *u = __node_2_uprobe(a);
+ return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
+}
+
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
- struct uprobe u = { .inode = inode, .offset = offset };
- struct rb_node *n = uprobes_tree.rb_node;
- struct uprobe *uprobe;
- int match;
+ struct __uprobe_key key = {
+ .inode = inode,
+ .offset = offset,
+ };
+ struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
- while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
- if (!match)
- return get_uprobe(uprobe);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- if (match < 0)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
return NULL;
}
@@ -668,32 +678,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
- struct rb_node **p = &uprobes_tree.rb_node;
- struct rb_node *parent = NULL;
- struct uprobe *u;
- int match;
-
- while (*p) {
- parent = *p;
- u = rb_entry(parent, struct uprobe, rb_node);
- match = match_uprobe(uprobe, u);
- if (!match)
- return get_uprobe(u);
+ struct rb_node *node;
- if (match < 0)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
-
- }
+ node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- u = NULL;
- rb_link_node(&uprobe->rb_node, parent, p);
- rb_insert_color(&uprobe->rb_node, &uprobes_tree);
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
-
- return u;
+ return NULL;
}
/*
@@ -794,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1150,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
@@ -1237,11 +1231,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1354,7 +1349,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1735,7 +1730,7 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task if if necessary.
+ * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
@@ -1823,7 +1818,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
t->utask->dup_xol_addr = area->vaddr;
init_task_work(&t->utask->dup_xol_work, dup_xol_work);
- task_work_add(t, &t->utask->dup_xol_work, true);
+ task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}
/*
@@ -1973,7 +1968,7 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
- if (signal_pending(t)) {
+ if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
@@ -1989,9 +1984,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
@@ -2029,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
- NULL, NULL);
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
if (result < 0)
return result;
@@ -2048,8 +2043,8 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct vm_area_struct *vma;
mmap_read_lock(mm);
- vma = find_vma(mm, bp_vaddr);
- if (vma && vma->vm_start <= bp_vaddr) {
+ vma = vma_lookup(mm, bp_vaddr);
+ if (vma) {
if (valid_vma(vma, false)) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
@@ -2189,7 +2184,7 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int uninitialized_var(is_swbp);
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..dfb963d2f862 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,9 +48,9 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
-#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -60,14 +60,70 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
-
+#include <linux/io_uring.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/sysfs.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
+
+#include <uapi/linux/wait.h>
+
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+#include "exit.h"
+
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -81,7 +137,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
@@ -93,7 +148,7 @@ static void __exit_signal(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
+ struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
@@ -115,7 +170,7 @@ static void __exit_signal(struct task_struct *tsk)
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
- wake_up_process(sig->group_exit_task);
+ wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@@ -167,6 +222,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+ kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -178,6 +235,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -187,7 +248,7 @@ repeat:
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
@@ -217,6 +278,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
@@ -337,6 +399,49 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
+static void coredump_task_exit(struct task_struct *tsk)
+{
+ struct core_state *core_state;
+
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = tsk->signal->core_state;
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ /* The vhost_worker does not particpate in coredumps */
+ if (core_state &&
+ ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) {
+ struct core_thread self;
+
+ self.task = current;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+}
+
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -420,6 +525,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -432,52 +538,33 @@ assign_new_owner:
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
- struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
- /*
- * Serialize with any possible pending coredump.
- * We must hold mmap_lock around checking core_state
- * and clearing tsk->mm. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group with ->mm != NULL.
- */
mmap_read_lock(mm);
- core_state = mm->core_state;
- if (core_state) {
- struct core_thread self;
-
- mmap_read_unlock(mm);
-
- self.task = current;
- self.next = xchg(&core_state->dumper.next, &self);
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
- mmap_read_lock(mm);
- }
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
+ /*
+ * When a thread stops operating on an address space, the loop
+ * in membarrier_private_expedited() may not observe that
+ * tsk->mm, and the loop in membarrier_global_expedited() may
+ * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+ * rq->membarrier_state, so those would not issue an IPI.
+ * Membarrier requires a memory barrier after accessing
+ * user-space memory, before clearing tsk->mm or the
+ * rq->membarrier_state.
+ */
+ smp_mb__after_spinlock();
+ local_irq_disable();
current->mm = NULL;
- mmap_read_unlock(mm);
+ membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
+ local_irq_enable();
task_unlock(current);
+ mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
@@ -672,7 +759,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
- wake_up_process(tsk->signal->group_exit_task);
+ wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -705,64 +792,43 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
+ spin_unlock_irq(&sighand->siglock);
+}
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- /*
- * We can get here from a kernel oops, sometimes with preemption off.
- * Start by checking for critical errors.
- * Then fix up important state like USER_DS and preemption.
- * Then do everything else.
- */
-
- WARN_ON(blk_needs_flush_plug(tsk));
-
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
+ WARN_ON(irqs_disabled());
- /*
- * If do_exit is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- */
- set_fs(USER_DS);
+ synchronize_group_exit(tsk, code);
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
+ WARN_ON(tsk->plug);
- profile_task_exit(tsk);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
+ coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
- validate_creds_for_do_exit(tsk);
-
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- futex_exit_recursive(tsk);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
-
+ io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
@@ -776,7 +842,7 @@ void __noreturn do_exit(long code)
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
@@ -804,7 +870,6 @@ void __noreturn do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
- exit_umh(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -844,7 +909,7 @@ void __noreturn do_exit(long code)
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
@@ -856,16 +921,66 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
-EXPORT_SYMBOL_GPL(do_exit);
-void complete_and_exit(struct completion *comp, long code)
+void __noreturn make_task_dead(int signr)
{
- if (comp)
- complete(comp);
+ /*
+ * Take the task off the cpu after something catastrophic has
+ * happened.
+ *
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
+ struct task_struct *tsk = current;
+ unsigned int limit;
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
- do_exit(code);
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ /*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
+ * We're taking recursive faults here in make_task_dead. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ futex_exit_recursive(tsk);
+ tsk->exit_state = EXIT_DEAD;
+ refcount_inc(&tsk->rcu_users);
+ do_task_dead();
+ }
+
+ do_exit(signr);
}
-EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@@ -876,22 +991,24 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
- BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
- else if (!thread_group_empty(current)) {
+ else if (sig->group_exec_task)
+ exit_code = 0;
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@@ -916,26 +1033,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
-struct waitid_info {
- pid_t pid;
- uid_t uid;
- int status;
- int cause;
-};
-
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct waitid_info *wo_info;
- int wo_stat;
- struct rusage *wo_rusage;
-
- wait_queue_entry_t child_wait;
- int notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
@@ -986,7 +1083,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- status = p->exit_code;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
@@ -1028,18 +1126,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
- * psig->stats_lock also protects us from our sub-theads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * psig->stats_lock also protects us from our sub-threads
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1062,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
@@ -1398,6 +1492,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
@@ -1405,13 +1510,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
-
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1420,17 +1522,50 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
TASK_INTERRUPTIBLE, p);
}
-static long do_wait(struct wait_opts *wo)
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+ struct task_struct *target)
{
- struct task_struct *tsk;
+ struct task_struct *parent =
+ !ptrace ? target->real_parent : target->parent;
+
+ return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+ same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+ bool ptrace;
+ struct task_struct *target;
int retval;
- trace_sched_process_wait(wo->wo_pid);
+ ptrace = false;
+ target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+ if (target && is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ ptrace = true;
+ target = pid_task(wo->wo_pid, PIDTYPE_PID);
+ if (target && target->ptrace &&
+ is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
+}
+
+long __do_wait(struct wait_opts *wo)
+{
+ long retval;
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
@@ -1442,62 +1577,70 @@ repeat:
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
- retval = do_wait_thread(wo, tsk);
- if (retval)
- goto end;
- retval = ptrace_do_wait(wo, tsk);
+ if (wo->wo_type == PIDTYPE_PID) {
+ retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
+ } else {
+ struct task_struct *tsk = current;
- if (wo->wo_flags & __WNOTHREAD)
- break;
- } while_each_thread(current, tsk);
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ return retval;
+
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ return retval;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
return retval;
}
-static struct pid *pidfd_get_pid(unsigned int fd)
+static long do_wait(struct wait_opts *wo)
{
- struct fd f;
- struct pid *pid;
+ int retval;
- f = fdget(fd);
- if (!f.file)
- return ERR_PTR(-EBADF);
+ trace_sched_process_wait(wo->wo_pid);
- pid = pidfd_pid(f.file);
- if (!IS_ERR(pid))
- get_pid(pid);
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
- fdput(f);
- return pid;
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+ return retval;
}
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1531,22 +1674,41 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
if (upid < 0)
return -EINVAL;
- pid = pidfd_get_pid(upid);
+ pid = pidfd_get_pid(upid, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
+
break;
default:
return -EINVAL;
}
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_rusage = ru;
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
+ if (f_flags & O_NONBLOCK)
+ wo->wo_flags |= WNOHANG;
+
+ return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
+
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
+
ret = do_wait(&wo);
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
+ ret = -EAGAIN;
- put_pid(pid);
+ put_pid(wo.wo_pid);
return ret;
}
@@ -1626,6 +1788,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
@@ -1711,7 +1889,38 @@ Efault:
}
#endif
-__weak void abort(void)
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if the thread group represented by tgid has exited (all
+ * threads are zombies, dead or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+ struct task_struct *task;
+ bool exited;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ exited = !task ||
+ (READ_ONCE(task->exit_state) && thread_group_empty(task));
+ rcu_read_unlock();
+
+ return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
{
BUG();
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
+
+ wait_queue_entry_t child_wait;
+ int notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru);
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -62,40 +63,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
-int init_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
int notrace core_kernel_text(unsigned long addr)
{
- if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ if (is_kernel_text(addr))
return 1;
- if (system_state < SYSTEM_RUNNING &&
- init_kernel_text(addr))
- return 1;
- return 0;
-}
-
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- * for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
- if (addr >= (unsigned long)_sdata &&
- addr < (unsigned long)_edata)
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -112,7 +86,7 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
- if (init_kernel_text(addr))
+ if (is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -140,7 +114,7 @@ int kernel_text_address(unsigned long addr)
/* Treat this like an NMI as it can happen anywhere */
if (no_rcu)
- rcu_nmi_enter();
+ ct_nmi_enter();
if (is_module_text_address(addr))
goto out;
@@ -153,18 +127,39 @@ int kernel_text_address(unsigned long addr)
ret = 0;
out:
if (no_rcu)
- rcu_nmi_exit();
+ ct_nmi_exit();
return ret;
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 63b349168da7..d971a0189319 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -37,9 +37,7 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
{
switch (get_injectable_error_type(addr)) {
case EI_ETYPE_NULL:
- if (retv != 0)
- return 0;
- break;
+ return 0;
case EI_ETYPE_ERRNO:
if (retv < (unsigned long)-MAX_ERRNO)
return (unsigned long)-EINVAL;
@@ -48,6 +46,8 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
return (unsigned long)-EINVAL;
break;
+ case EI_ETYPE_TRUE:
+ return 1;
}
return retv;
@@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)
static void fei_debugfs_remove_attr(struct fei_attr *attr)
{
- struct dentry *dir;
-
- dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- debugfs_remove_recursive(dir);
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
@@ -247,15 +244,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +291,16 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
- kfree(buf);
mutex_unlock(&fei_lock);
+ kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..0d944e92a43f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,12 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -52,6 +53,7 @@
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
@@ -74,9 +76,7 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@@ -95,6 +95,12 @@
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
+#include <linux/io_uring.h>
+#include <linux/bpf.h>
+#include <linux/stackprotector.h>
+#include <linux/user_events.h>
+#include <linux/iommu.h>
+#include <linux/rseq.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -161,7 +167,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -173,9 +178,6 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -183,7 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -191,6 +193,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -208,11 +245,31 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
-#ifdef CONFIG_VMAP_STACK
+ int i;
+ int ret;
+ int nr_charged = 0;
+
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ nr_charged++;
+ }
+ return 0;
+err:
+ for (i = 0; i < nr_charged; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm;
void *stack;
int i;
@@ -224,15 +281,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Clear the KASAN shadow of the stack. */
- kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+ /* Reset stack metadata. */
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
+
+ stack = kasan_reset_tag(s->addr);
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
+
+ if (memcg_charge_kernel_stack(s)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
tsk->stack_vm_area = s;
- tsk->stack = s->addr;
- return s->addr;
+ tsk->stack = stack;
+ return 0;
}
/*
@@ -245,75 +309,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
- tsk->stack = page_address(page);
- return tsk->stack;
+ tsk->stack = kasan_reset_tag(page_address(page));
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- -(int)(PAGE_SIZE / 1024));
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- memcg_kmem_uncharge_page(vm->pages[i], 0);
- }
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -323,8 +408,8 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -344,13 +429,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -358,92 +479,94 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- *new = *orig;
- INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
-static void account_kernel_stack(struct task_struct *tsk, int account)
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
- if (vm) {
- int i;
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+static void account_kernel_stack(struct task_struct *tsk, int account)
+{
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ account * (PAGE_SIZE / 1024));
} else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
+ void *stack = task_stack_page(tsk);
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
-
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ /* All stack pages are in the same node. */
+ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and both memcg_kmem_uncharge_page()
- * and mod_memcg_page_state() in free_thread_stack()
- * will ignore this page. So it's safe.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
-
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- PAGE_SIZE / 1024);
- }
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
{
- if (WARN_ON(tsk->state != TASK_DEAD))
+ if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -456,6 +579,10 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
+ release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -473,23 +600,37 @@ void free_task(struct task_struct *tsk)
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
- put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
+ bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
+static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct file *exe_file;
+
+ exe_file = get_mm_exe_file(oldmm);
+ RCU_INIT_POINTER(mm->exe_file, exe_file);
+ /*
+ * We depend on the oldmm having properly denied write access to the
+ * exe_file already.
+ */
+ if (exe_file && deny_write_access(exe_file))
+ pr_warn_once("deny_write_access() failed in %s\n", __func__);
+}
+
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -504,28 +645,34 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mm->total_vm = oldmm->total_vm;
mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
- retval = khugepaged_fork(mm, oldmm);
- if (retval)
+ khugepaged_fork(mm, oldmm);
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
struct file *file;
+ vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
@@ -536,7 +683,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -564,18 +711,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_interval_tree_insert_after(tmp, mpnt,
@@ -585,37 +729,46 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
/*
- * Link in the new vma and copy the page table entries.
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
*/
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
-
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(mm, oldmm, mpnt);
+ retval = copy_page_range(tmp, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
- goto out;
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -624,6 +777,7 @@ out:
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
+
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -631,7 +785,7 @@ fail_nomem_policy:
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -650,7 +804,7 @@ static inline void mm_free_pgd(struct mm_struct *mm)
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
mmap_write_lock(oldmm);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mmap_write_unlock(oldmm);
return 0;
}
@@ -666,7 +820,7 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -685,6 +839,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -694,12 +909,20 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -745,18 +968,26 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
+ sched_core_free(tsk);
+ free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
@@ -788,7 +1019,6 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -803,12 +1033,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
@@ -821,7 +1049,6 @@ void __init fork_init(void)
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -833,9 +1060,13 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < UCOUNT_COUNTS; i++) {
+ for (i = 0; i < UCOUNT_COUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
- }
+
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -866,8 +1097,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -876,32 +1105,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -921,12 +1136,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
+ clear_syscall_work_syscall_user_dispatch(tsk);
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
@@ -941,26 +1158,43 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
-
- account_kernel_stack(tsk, 1);
+ tsk->worker_private = NULL;
kcov_task_init(tsk);
+ kmsan_task_create(tsk);
+ kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ tsk->reported_split_lock = 0;
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
+ tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
+#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1017,14 +1251,16 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
+ seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1035,6 +1271,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1042,9 +1279,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->flags = mmf_init_flags(current->mm->flags);
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
mm->flags = default_dump_filter;
@@ -1057,9 +1295,21 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1100,6 +1350,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -1131,20 +1382,23 @@ void mmput_async(struct mm_struct *mm)
schedule_work(&mm->async_put_work);
}
}
+EXPORT_SYMBOL_GPL(mmput_async);
#endif
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
- * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to do avoid the need for any locks.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
+ *
+ * Can only fail if new_exe_file != NULL.
*/
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
@@ -1155,15 +1409,79 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file)
+ if (new_exe_file) {
+ /*
+ * We expect the caller (i.e., sys_execve) to already denied
+ * write access, so this is unlikely to fail.
+ */
+ if (unlikely(deny_write_access(new_exe_file)))
+ return -EACCES;
get_file(new_exe_file);
+ }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file)
+ if (old_exe_file) {
+ allow_write_access(old_exe_file);
fput(old_exe_file);
+ }
+ return 0;
+}
+
+/**
+ * replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ */
+int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct vm_area_struct *vma;
+ struct file *old_exe_file;
+ int ret = 0;
+
+ /* Forbid mm->exe_file change if old file still mapped. */
+ old_exe_file = get_mm_exe_file(mm);
+ if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &old_exe_file->f_path)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ mmap_read_unlock(mm);
+ fput(old_exe_file);
+ if (ret)
+ return ret;
+ }
+
+ ret = deny_write_access(new_exe_file);
+ if (ret)
+ return -EACCES;
+ get_file(new_exe_file);
+
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
+ if (old_exe_file) {
+ allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ }
+ return 0;
}
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -1173,16 +1491,14 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
-EXPORT_SYMBOL(get_mm_exe_file);
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1202,10 +1518,10 @@ struct file *get_task_exe_file(struct task_struct *task)
task_unlock(task);
return exe_file;
}
-EXPORT_SYMBOL(get_task_exe_file);
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -1235,7 +1551,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
struct mm_struct *mm;
int err;
- err = mutex_lock_killable(&task->signal->exec_update_mutex);
+ err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return ERR_PTR(err);
@@ -1245,7 +1561,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mmput(mm);
mm = ERR_PTR(-EACCES);
}
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return mm;
}
@@ -1266,13 +1582,12 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
cgroup_enter_frozen();
- killed = wait_for_completion_killable(vfork);
+ killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
- freezer_count();
if (killed) {
task_lock(child);
@@ -1310,8 +1625,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
- atomic_read(&mm->mm_users) > 1) {
+ if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
@@ -1393,7 +1707,6 @@ fail_nomem:
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
- int retval;
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
@@ -1414,27 +1727,19 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
- goto good_mm;
+ } else {
+ mm = dup_mm(tsk, current->mm);
+ if (!mm)
+ return -ENOMEM;
}
- retval = -ENOMEM;
- mm = dup_mm(tsk, current->mm);
- if (!mm)
- goto fail_nomem;
-
-good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
-
-fail_nomem:
- return retval;
}
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
@@ -1443,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
spin_unlock(&fs->lock);
return -EAGAIN;
@@ -1457,7 +1763,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1469,12 +1776,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
- newf = dup_fd(oldf, &error);
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
if (!newf)
goto out;
@@ -1484,32 +1796,6 @@ out:
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1572,6 +1858,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
sig->nr_threads = 1;
+ sig->quick_threads = 1;
atomic_set(&sig->live, 1);
refcount_set(&sig->sigcnt, 1);
@@ -1605,7 +1892,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
- mutex_init(&sig->exec_update_mutex);
+ init_rwsem(&sig->exec_update_lock);
return 0;
}
@@ -1639,7 +1926,7 @@ static void copy_seccomp(struct task_struct *p)
* to manually enable the seccomp thread flag here.
*/
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
- set_tsk_thread_flag(p, TIF_SECCOMP);
+ set_task_syscall_work(p, SECCOMP);
#endif
}
@@ -1664,9 +1951,8 @@ static inline void init_task_pid_links(struct task_struct *task)
{
enum pid_type type;
- for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
INIT_HLIST_NODE(&task->pid_links[type]);
- }
}
static inline void
@@ -1695,6 +1981,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
+ INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
@@ -1741,7 +2028,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
* /proc/<pid>/status where Pid and NSpid are always shown relative to
* the pid namespace of the procfs instance. The difference becomes
* obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestoral relation is
+ * different branch of the tree, i.e. where no ancestral relation is
* present between the pid namespaces:
* - create two new pid namespaces ns1 and ns2 in the initial pid
* namespace (also take care to create new mount namespaces in the
@@ -1787,22 +2074,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
- struct task_struct *task;
struct pid *pid = file->private_data;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
- if (!task || (task->exit_state && thread_group_empty(task)))
+ if (thread_group_exited(pid))
poll_flags = EPOLLIN | EPOLLRDNORM;
- rcu_read_unlock();
return poll_flags;
}
@@ -1815,6 +2098,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret: Where to return the file for the pidfd.
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret: Where to return the pidfd.
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -1830,6 +2198,37 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
free_task(tsk);
}
+static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
+{
+ /* Skip if kernel thread */
+ if (!tsk->mm)
+ return;
+
+ /* Skip if spawning a thread or using vfork */
+ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
+ return;
+
+ /* We need to synchronize with __set_oom_adj */
+ mutex_lock(&oom_adj_mutex);
+ set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ /* Update the values in case they were changed after copy_signal */
+ tsk->signal->oom_score_adj = current->signal->oom_score_adj;
+ tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
+ mutex_unlock(&oom_adj_mutex);
+}
+
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+ int i;
+
+ for (i = 0; i < RV_PER_TASK_MONITORS; i++)
+ p->rv[i].da_mon.monitoring = false;
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1838,7 +2237,7 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -1848,7 +2247,7 @@ static __latent_entropy struct task_struct *copy_process(
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
- u64 clone_flags = args->flags;
+ const u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;
/*
@@ -1896,15 +2295,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- */
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
@@ -1930,20 +2320,30 @@ static __latent_entropy struct task_struct *copy_process(
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
retval = -ERESTARTNOINTR;
- if (signal_pending(current))
+ if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ p->flags &= ~PF_KTHREAD;
+ if (args->kthread)
+ p->flags |= PF_KTHREAD;
+ if (args->user_worker) {
+ /*
+ * Mark us a user worker, and block any signal that isn't
+ * fatal or STOP
+ */
+ p->flags |= PF_USER_WORKER;
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ }
+ if (args->io_thread)
+ p->flags |= PF_IO_WORKER;
+
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
- /*
- * This _must_ happen before we call free_task(), i.e. before we jump
- * to any of the bad_fork_* labels. This is to avoid freeing
- * p->set_child_tid which is (ab)used as a kthread's data pointer for
- * kernel threads (PF_KTHREAD).
- */
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -1954,23 +2354,22 @@ static __latent_entropy struct task_struct *copy_process(
rt_mutex_init_task(p);
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
@@ -1981,7 +2380,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -2003,8 +2402,8 @@ static __latent_entropy struct task_struct *copy_process(
p->vtime.state = VTIME_INACTIVE;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+#ifdef CONFIG_IO_URING
+ p->io_uring = NULL;
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
@@ -2021,33 +2420,29 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
+ if (args->kthread) {
+ if (!set_kthread_struct(p))
+ goto bad_fork_cleanup_delayacct;
+ }
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_threadgroup_lock;
+ goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
- p->irq_events = 0;
- p->hardirqs_enabled = 0;
- p->hardirq_enable_ip = 0;
- p->hardirq_enable_event = 0;
- p->hardirq_disable_ip = _THIS_IP_;
- p->hardirq_disable_event = 0;
- p->softirqs_enabled = 1;
- p->softirq_enable_ip = _THIS_IP_;
- p->softirq_enable_event = 0;
- p->softirq_disable_ip = 0;
- p->softirq_disable_event = 0;
- p->hardirq_context = 0;
- p->softirq_context = 0;
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
+ p->softirqs_enabled = 1;
+ p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
@@ -2063,13 +2458,17 @@ static __latent_entropy struct task_struct *copy_process(
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ RCU_INIT_POINTER(p->bpf_storage, NULL);
+ p->bpf_ctx = NULL;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
- retval = perf_event_init_task(p);
+ retval = perf_event_init_task(p, clone_flags);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
@@ -2083,7 +2482,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2104,8 +2503,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
- args->tls);
+ retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
@@ -2126,21 +2524,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2162,23 +2551,18 @@ static __latent_entropy struct task_struct *copy_process(
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
- clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(p, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(p, SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
- p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
- if (clone_flags & CLONE_PARENT)
- p->exit_signal = current->group_leader->exit_signal;
- else
- p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -2188,12 +2572,19 @@ static __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
+
+#ifdef CONFIG_KRETPROBES
+ p->kretprobe_instances.first = NULL;
+#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
- * forked. It should be noted the the new process's css_set can be changed
+ * forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
@@ -2202,6 +2593,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2222,20 +2624,23 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
+ if (clone_flags & CLONE_THREAD)
+ p->exit_signal = -1;
+ else
+ p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
+ p->exit_signal = args->exit_signal;
}
klp_copy_process(p);
+ sched_core_fork(p);
+
spin_lock(&current->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
+ rv_task_fork(p);
rseq_fork(p, clone_flags);
@@ -2251,9 +2656,13 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2286,11 +2695,10 @@ static __latent_entropy struct task_struct *copy_process(
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
+ current->signal->quick_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
@@ -2303,16 +2711,24 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
+
+ copy_oom_score_adj(clone_flags, p);
return p;
bad_fork_cancel_cgroup:
+ sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
@@ -2357,14 +2773,15 @@ bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_threadgroup_lock:
#endif
+bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
- p->state = TASK_DEAD;
+ WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
@@ -2384,11 +2801,21 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
-struct task_struct *fork_idle(int cpu)
+static int idle_dummy(void *dummy)
+{
+ /* This function is never called */
+ return 0;
+}
+
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
- .flags = CLONE_VM,
+ .flags = CLONE_VM,
+ .fn = &idle_dummy,
+ .fn_arg = NULL,
+ .kthread = 1,
+ .idle = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2400,9 +2827,27 @@ struct task_struct *fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
+/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
- return dup_mm(NULL, &init_mm);
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ CLONE_IO;
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ .io_thread = 1,
+ .user_worker = 1,
+ };
+
+ return copy_process(NULL, 0, node, &args);
}
/*
@@ -2413,14 +2858,28 @@ struct mm_struct *copy_init_mm(void)
*
* args->exit_signal is expected to be checked for sanity by the caller.
*/
-long _do_fork(struct kernel_clone_args *args)
+pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
- long nr;
+ pid_t nr;
+
+ /*
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+ * field in struct clone_args and it still doesn't make sense to have
+ * them both point at the same memory location. Performing this check
+ * here has the advantage that we don't need to have a separate helper
+ * to check for legacy clone().
+ */
+ if ((args->flags & CLONE_PIDFD) &&
+ (args->flags & CLONE_PARENT_SETTID) &&
+ (args->pidfd == args->parent_tid))
+ return -EINVAL;
/*
* Determine whether and which event to report to ptracer. When
@@ -2464,6 +2923,13 @@ long _do_fork(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
@@ -2479,56 +2945,39 @@ long _do_fork(struct kernel_clone_args *args)
return nr;
}
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((kargs->flags & CLONE_PIDFD) &&
- (kargs->flags & CLONE_PARENT_SETTID))
- return false;
-
- return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr)
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
- .pidfd = parent_tidptr,
- .child_tid = child_tidptr,
- .parent_tid = parent_tidptr,
- .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
- .stack = stack_start,
- .stack_size = stack_size,
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ .name = name,
+ .kthread = 1,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
- return _do_fork(&args);
+ return kernel_clone(&args);
}
-#endif
/*
- * Create a kernel thread.
+ * Create a user mode thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .fn = fn,
+ .fn_arg = arg,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
@@ -2539,7 +2988,7 @@ SYSCALL_DEFINE0(fork)
.exit_signal = SIGCHLD,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2555,7 +3004,7 @@ SYSCALL_DEFINE0(vfork)
.exit_signal = SIGCHLD,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#endif
@@ -2593,24 +3042,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -2700,7 +3137,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
@@ -2716,10 +3153,10 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
return false;
/*
- * - make the CLONE_DETACHED bit reuseable for clone3
- * - make the CSIGNAL bits reuseable for clone3
+ * - make the CLONE_DETACHED bit reusable for clone3
+ * - make the CSIGNAL bits reusable for clone3
*/
- if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
@@ -2737,7 +3174,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
@@ -2763,7 +3200,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
if (!clone3_args_valid(&kargs))
return -EINVAL;
- return _do_fork(&kargs);
+ return kernel_clone(&kargs);
}
#endif
@@ -2811,10 +3248,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2832,20 +3286,10 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
@@ -2907,14 +3351,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
/*
* Unshare file descriptor table if it is being shared
*/
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+ struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
+ *new_fdp = dup_fd(fd, max_fds, &error);
if (!*new_fdp)
return error;
}
@@ -2925,7 +3370,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by kernel_clone() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
@@ -2933,7 +3378,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@@ -2974,7 +3419,7 @@ int ksys_unshare(unsigned long unshare_flags)
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, &new_fd);
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
@@ -2985,6 +3430,12 @@ int ksys_unshare(unsigned long unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
+ if (new_cred) {
+ err = set_cred_ucounts(new_cred);
+ if (err)
+ goto bad_unshare_cleanup_cred;
+ }
+
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
@@ -3014,11 +3465,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
@@ -3057,26 +3505,26 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* the exec layer of the kernel.
*/
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
{
struct task_struct *task = current;
- struct files_struct *copy = NULL;
+ struct files_struct *old, *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, &copy);
- if (error || !copy) {
- *displaced = NULL;
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+ if (error || !copy)
return error;
- }
- *displaced = task->files;
+
+ old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
+ put_files_struct(old);
return 0;
}
int sysctl_max_threads(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int ret;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index dc520f01f99d..f57aaf96b829 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -13,10 +13,11 @@
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
* system_transition_mutex
*/
bool pm_freezing;
@@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -52,41 +53,44 @@ bool freezing_slow_path(struct task_struct *p)
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- long save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ raw_spin_lock_irq(&current->pi_lock);
+ set_current_state(TASK_FROZEN);
+ /* unstale saved_state so that __thaw_task() will wake us up */
+ current->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&current->pi_lock);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +105,45 @@ static void fake_signal_wake_up(struct task_struct *p)
}
}
+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ if (p->on_rq)
+ return 0;
+
+ if (p != current && task_curr(p))
+ return 0;
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ p->saved_state = p->__state;
+ WRITE_ONCE(p->__state, TASK_FROZEN);
+ return TASK_FROZEN;
+}
+
+static bool __freeze_task(struct task_struct *p)
+{
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -116,20 +159,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -137,19 +168,45 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
+ */
+static int __restore_freezer_state(struct task_struct *p, void *arg)
+{
+ unsigned int state = p->saved_state;
+
+ if (state != TASK_RUNNING) {
+ WRITE_ONCE(p->__state, state);
+ p->saved_state = TASK_RUNNING;
+ return 1;
+ }
+
+ return 0;
+}
+
void __thaw_task(struct task_struct *p)
{
unsigned long flags;
spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
+ if (WARN_ON_ONCE(freezing(p)))
+ goto unlock;
+
+ if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
+ goto unlock;
+
+ wake_up_state(p, TASK_FROZEN);
+unlock:
spin_unlock_irqrestore(&freezer_lock, flags);
}
diff --git a/kernel/futex.c b/kernel/futex.c
deleted file mode 100644
index e646661f6282..000000000000
--- a/kernel/futex.c
+++ /dev/null
@@ -1,4112 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Fast Userspace Mutexes (which I call "Futexes!").
- * (C) Rusty Russell, IBM 2002
- *
- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
- *
- * Removed page pinning, fix privately mapped COW pages and other cleanups
- * (C) Copyright 2003, 2004 Jamie Lokier
- *
- * Robust futex support started by Ingo Molnar
- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
- * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
- *
- * PI-futex support started by Ingo Molnar and Thomas Gleixner
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * PRIVATE futexes by Eric Dumazet
- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
- *
- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
- * Copyright (C) IBM Corporation, 2009
- * Thanks to Thomas Gleixner for conceptual design and careful reviews.
- *
- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
- * enough at me, Linus for the original (flawed) idea, Matthew
- * Kirkwood for proof-of-concept implementation.
- *
- * "The futexes are also cursed."
- * "But they come in a choice of three flavours!"
- */
-#include <linux/compat.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/jhash.h>
-#include <linux/init.h>
-#include <linux/futex.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/syscalls.h>
-#include <linux/signal.h>
-#include <linux/export.h>
-#include <linux/magic.h>
-#include <linux/pid.h>
-#include <linux/nsproxy.h>
-#include <linux/ptrace.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/freezer.h>
-#include <linux/memblock.h>
-#include <linux/fault-inject.h>
-#include <linux/refcount.h>
-
-#include <asm/futex.h>
-
-#include "locking/rtmutex_common.h"
-
-/*
- * READ this before attempting to hack on futexes!
- *
- * Basic futex operation and ordering guarantees
- * =============================================
- *
- * The waiter reads the futex value in user space and calls
- * futex_wait(). This function computes the hash bucket and acquires
- * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not changed
- * it enqueues itself into the hash bucket, releases the hash bucket lock
- * and schedules.
- *
- * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This function computes the hash bucket and acquires the
- * hash bucket lock. Then it looks for waiters on that futex in the hash
- * bucket and wakes them.
- *
- * In futex wake up scenarios where no tasks are blocked on a futex, taking
- * the hb spinlock can be avoided and simply return. In order for this
- * optimization to work, ordering guarantees must exist so that the waiter
- * being added to the list is acknowledged when the list is concurrently being
- * checked by the waker, avoiding scenarios like the following:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- * uval = *futex;
- * *futex = newval;
- * sys_futex(WAKE, futex);
- * futex_wake(futex);
- * if (queue_empty())
- * return;
- * if (uval == val)
- * lock(hash_bucket(futex));
- * queue();
- * unlock(hash_bucket(futex));
- * schedule();
- *
- * This would cause the waiter on CPU 0 to wait forever because it
- * missed the transition of the user space value from val to newval
- * and the waker did not find the waiter in the hash bucket queue.
- *
- * The correct serialization ensures that a waiter either observes
- * the changed user space value before blocking or is woken by a
- * concurrent waker:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- *
- * waiters++; (a)
- * smp_mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `--------> smp_mb(); (B)
- * if (uval == val)
- * queue();
- * unlock(hash_bucket(futex));
- * schedule(); if (waiters)
- * lock(hash_bucket(futex));
- * else wake_waiters(futex);
- * waiters--; (b) unlock(hash_bucket(futex));
- *
- * Where (A) orders the waiters increment and the futex value read through
- * atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read (see hb_waiters_pending()).
- *
- * This yields the following case (where X:=waiters, Y:=futex):
- *
- * X = Y = 0
- *
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible; which translates back into
- * the guarantee that we cannot both miss the futex variable change and the
- * enqueue.
- *
- * Note that a new waiter is accounted for in (a) even when it is possible that
- * the wait call can return error, in which case we backtrack from it in (b).
- * Refer to the comment in queue_lock().
- *
- * Similarly, in order to account for waiters being requeued on another
- * address we always increment the waiters for the destination bucket before
- * acquiring the lock. It then decrements them again after releasing it -
- * the code that actually moves the futex(es) between hash buckets (requeue_futex)
- * will do the additional required waiter count housekeeping. This is done for
- * double_lock_hb() and double_unlock_hb(), respectively.
- */
-
-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-#define futex_cmpxchg_enabled 1
-#else
-static int __read_mostly futex_cmpxchg_enabled;
-#endif
-
-/*
- * Futex flags used to encode options to functions and preserve them across
- * restarts.
- */
-#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
-#else
-/*
- * NOMMU does not have per process address space. Let the compiler optimize
- * code away.
- */
-# define FLAGS_SHARED 0x00
-#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
-/*
- * Priority Inheritance state:
- */
-struct futex_pi_state {
- /*
- * list of 'owned' pi_state instances - these have to be
- * cleaned up in do_exit() if the task exits prematurely:
- */
- struct list_head list;
-
- /*
- * The PI object:
- */
- struct rt_mutex pi_mutex;
-
- struct task_struct *owner;
- refcount_t refcount;
-
- union futex_key key;
-} __randomize_layout;
-
-/**
- * struct futex_q - The hashed futex queue entry, one per waiting task
- * @list: priority-sorted list of tasks waiting on this futex
- * @task: the task waiting on the futex
- * @lock_ptr: the hash bucket lock
- * @key: the key the futex is hashed on
- * @pi_state: optional priority inheritance state
- * @rt_waiter: rt_waiter storage for use with requeue_pi
- * @requeue_pi_key: the requeue_pi target futex key
- * @bitset: bitset for the optional bitmasked wakeup
- *
- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
- * we can wake only the relevant ones (hashed queues may be shared).
- *
- * A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakeup is always to make the first condition true, then
- * the second.
- *
- * PI futexes are typically woken before they are removed from the hash list via
- * the rt_mutex code. See unqueue_me_pi().
- */
-struct futex_q {
- struct plist_node list;
-
- struct task_struct *task;
- spinlock_t *lock_ptr;
- union futex_key key;
- struct futex_pi_state *pi_state;
- struct rt_mutex_waiter *rt_waiter;
- union futex_key *requeue_pi_key;
- u32 bitset;
-} __randomize_layout;
-
-static const struct futex_q futex_q_init = {
- /* list gets initialized in queue_me()*/
- .key = FUTEX_KEY_INIT,
- .bitset = FUTEX_BITSET_MATCH_ANY
-};
-
-/*
- * Hash buckets are shared by all the futex_keys that hash to the same
- * location. Each key may have multiple futex_q structures, one for each task
- * waiting on a futex.
- */
-struct futex_hash_bucket {
- atomic_t waiters;
- spinlock_t lock;
- struct plist_head chain;
-} ____cacheline_aligned_in_smp;
-
-/*
- * The base of the bucket array and its size are always used together
- * (after initialization only in hash_futex()), so ensure that they
- * reside in the same cacheline.
- */
-static struct {
- struct futex_hash_bucket *queues;
- unsigned long hashsize;
-} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
-
-
-/*
- * Fault injections for futexes.
- */
-#ifdef CONFIG_FAIL_FUTEX
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_private;
-} fail_futex = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_private = false,
-};
-
-static int __init setup_fail_futex(char *str)
-{
- return setup_fault_attr(&fail_futex.attr, str);
-}
-__setup("fail_futex=", setup_fail_futex);
-
-static bool should_fail_futex(bool fshared)
-{
- if (fail_futex.ignore_private && !fshared)
- return false;
-
- return should_fail(&fail_futex.attr, 1);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_futex_debugfs(void)
-{
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_futex", NULL,
- &fail_futex.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private);
- return 0;
-}
-
-late_initcall(fail_futex_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else
-static inline bool should_fail_futex(bool fshared)
-{
- return false;
-}
-#endif /* CONFIG_FAIL_FUTEX */
-
-#ifdef CONFIG_COMPAT
-static void compat_exit_robust_list(struct task_struct *curr);
-#else
-static inline void compat_exit_robust_list(struct task_struct *curr) { }
-#endif
-
-/*
- * Reflects a new waiter being added to the waitqueue.
- */
-static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_inc(&hb->waiters);
- /*
- * Full barrier (A), see the ordering comment above.
- */
- smp_mb__after_atomic();
-#endif
-}
-
-/*
- * Reflects a waiter being removed from the waitqueue by wakeup
- * paths.
- */
-static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_dec(&hb->waiters);
-#endif
-}
-
-static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- /*
- * Full barrier (B), see the ordering comment above.
- */
- smp_mb();
- return atomic_read(&hb->waiters);
-#else
- return 1;
-#endif
-}
-
-/**
- * hash_futex - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
- *
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
- */
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
-{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
-
- return &futex_queues[hash & (futex_hashsize - 1)];
-}
-
-
-/**
- * match_futex - Check whether two futex keys are equal
- * @key1: Pointer to key1
- * @key2: Pointer to key2
- *
- * Return 1 if two futex_keys are equal, 0 otherwise.
- */
-static inline int match_futex(union futex_key *key1, union futex_key *key2)
-{
- return (key1 && key2
- && key1->both.word == key2->both.word
- && key1->both.ptr == key2->both.ptr
- && key1->both.offset == key2->both.offset);
-}
-
-enum futex_access {
- FUTEX_READ,
- FUTEX_WRITE
-};
-
-/**
- * futex_setup_timer - set up the sleeping hrtimer.
- * @time: ptr to the given timeout value
- * @timeout: the hrtimer_sleeper structure to be set up
- * @flags: futex flags
- * @range_ns: optional range in ns
- *
- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
- * value given
- */
-static inline struct hrtimer_sleeper *
-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
- int flags, u64 range_ns)
-{
- if (!time)
- return NULL;
-
- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- /*
- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
- * effectively the same as calling hrtimer_set_expires().
- */
- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
-
- return timeout;
-}
-
-/*
- * Generate a machine wide unique identifier for this inode.
- *
- * This relies on u64 not wrapping in the life-time of the machine; which with
- * 1ns resolution means almost 585 years.
- *
- * This further relies on the fact that a well formed program will not unmap
- * the file while it has a (shared) futex waiting on it. This mapping will have
- * a file reference which pins the mount and inode.
- *
- * If for some reason an inode gets evicted and read back in again, it will get
- * a new sequence number and will _NOT_ match, even though it is the exact same
- * file.
- *
- * It is important that match_futex() will never have a false-positive, esp.
- * for PI futexes that can mess up the state. The above argues that false-negatives
- * are only possible for malformed programs.
- */
-static u64 get_inode_sequence_number(struct inode *inode)
-{
- static atomic64_t i_seq;
- u64 old;
-
- /* Does the inode already have a sequence number? */
- old = atomic64_read(&inode->i_sequence);
- if (likely(old))
- return old;
-
- for (;;) {
- u64 new = atomic64_add_return(1, &i_seq);
- if (WARN_ON_ONCE(!new))
- continue;
-
- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
- if (old)
- return old;
- return new;
- }
-}
-
-/**
- * get_futex_key() - Get parameters which are the keys for a futex
- * @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: FUTEX_READ,
- * FUTEX_WRITE)
- *
- * Return: a negative error code or 0
- *
- * The key words are stored in @key on success.
- *
- * For shared mappings (when @fshared), the key is:
- *
- * ( inode->i_sequence, page->index, offset_within_page )
- *
- * [ also see get_inode_sequence_number() ]
- *
- * For private mappings (or when !@fshared), the key is:
- *
- * ( current->mm, address, 0 )
- *
- * This allows (cross process, where applicable) identification of the futex
- * without keeping the page pinned for the duration of the FUTEX_WAIT.
- *
- * lock_page() might sleep, the caller should not hold a spinlock.
- */
-static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
-{
- unsigned long address = (unsigned long)uaddr;
- struct mm_struct *mm = current->mm;
- struct page *page, *tail;
- struct address_space *mapping;
- int err, ro = 0;
-
- /*
- * The futex address must be "naturally" aligned.
- */
- key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
- return -EINVAL;
- address -= key->both.offset;
-
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(fshared)))
- return -EFAULT;
-
- /*
- * PROCESS_PRIVATE futexes are fast.
- * As the mm cannot disappear under us and the 'key' only needs
- * virtual address, we dont even have to find the underlying vma.
- * Note : We do have to check 'uaddr' is a valid user address,
- * but access_ok() should be faster than find_vma()
- */
- if (!fshared) {
- key->private.mm = mm;
- key->private.address = address;
- return 0;
- }
-
-again:
- /* Ignore any VERIFY_READ mapping (futex common case) */
- if (unlikely(should_fail_futex(fshared)))
- return -EFAULT;
-
- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
- /*
- * If write access is not required (eg. FUTEX_WAIT), try
- * and get read-only access.
- */
- if (err == -EFAULT && rw == FUTEX_READ) {
- err = get_user_pages_fast(address, 1, 0, &page);
- ro = 1;
- }
- if (err < 0)
- return err;
- else
- err = 0;
-
- /*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
- * stabilizes mapping, prevents inode freeing in the shared
- * file-backed region case and guards against movement to swap cache.
- *
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
- * From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
- *
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
- */
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
-
- /*
- * If page->mapping is NULL, then it cannot be a PageAnon
- * page; but it might be the ZERO_PAGE or in the gate area or
- * in a special mapping (all cases which we are happy to fail);
- * or it may have been a good file page when get_user_pages_fast
- * found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
- * cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
- * prevents drop_caches from setting mapping to NULL beneath us.
- *
- * The case we do have to guard against is when memory pressure made
- * shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
- */
- if (unlikely(!mapping)) {
- int shmem_swizzled;
-
- /*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
- * will prevent unexpected transitions.
- */
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
-
- if (shmem_swizzled)
- goto again;
-
- return -EFAULT;
- }
-
- /*
- * Private mappings are handled in a simple way.
- *
- * If the futex key is stored on an anonymous page, then the associated
- * object is the mm which is implicitly pinned by the calling process.
- *
- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
- * it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process.
- */
- if (PageAnon(page)) {
- /*
- * A RO anonymous page will never change and thus doesn't make
- * sense for futex operations.
- */
- if (unlikely(should_fail_futex(fshared)) || ro) {
- err = -EFAULT;
- goto out;
- }
-
- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
- key->private.mm = mm;
- key->private.address = address;
-
- } else {
- struct inode *inode;
-
- /*
- * The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
- * necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
- *
- * The RCU read lock is taken as the inode is finally freed
- * under RCU. If the mapping still matches expectations then the
- * mapping->host can be safely accessed as being a valid inode.
- */
- rcu_read_lock();
-
- if (READ_ONCE(page->mapping) != mapping) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- inode = READ_ONCE(mapping->host);
- if (!inode) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = basepage_index(tail);
- rcu_read_unlock();
- }
-
-out:
- put_page(page);
- return err;
-}
-
-static inline void put_futex_key(union futex_key *key)
-{
-}
-
-/**
- * fault_in_user_writeable() - Fault in user address and verify RW access
- * @uaddr: pointer to faulting user space address
- *
- * Slow path to fixup the fault we just took in the atomic write
- * access to @uaddr.
- *
- * We have no generic implementation of a non-destructive write to the
- * user address. We know that we faulted in the atomic pagefault
- * disabled section so we can as well avoid the #PF overhead by
- * calling get_user_pages() right away.
- */
-static int fault_in_user_writeable(u32 __user *uaddr)
-{
- struct mm_struct *mm = current->mm;
- int ret;
-
- mmap_read_lock(mm);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE, NULL);
- mmap_read_unlock(mm);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb: the hash bucket the futex_q's reside in
- * @key: the futex key (to distinguish it from other futex futex_q's)
- *
- * Must be called with the hb lock held.
- */
-static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
- union futex_key *key)
-{
- struct futex_q *this;
-
- plist_for_each_entry(this, &hb->chain, list) {
- if (match_futex(&this->key, key))
- return this;
- }
- return NULL;
-}
-
-static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
- u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-static int get_futex_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
-
-/*
- * PI code:
- */
-static int refill_pi_state_cache(void)
-{
- struct futex_pi_state *pi_state;
-
- if (likely(current->pi_state_cache))
- return 0;
-
- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
-
- if (!pi_state)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&pi_state->list);
- /* pi_mutex gets initialized later */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- pi_state->key = FUTEX_KEY_INIT;
-
- current->pi_state_cache = pi_state;
-
- return 0;
-}
-
-static struct futex_pi_state *alloc_pi_state(void)
-{
- struct futex_pi_state *pi_state = current->pi_state_cache;
-
- WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
-
- return pi_state;
-}
-
-static void get_pi_state(struct futex_pi_state *pi_state)
-{
- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
-}
-
-/*
- * Drops a reference to the pi_state object and frees or caches it
- * when the last reference is gone.
- */
-static void put_pi_state(struct futex_pi_state *pi_state)
-{
- if (!pi_state)
- return;
-
- if (!refcount_dec_and_test(&pi_state->refcount))
- return;
-
- /*
- * If pi_state->owner is NULL, the owner is most probably dying
- * and has cleaned up the pi_state already
- */
- if (pi_state->owner) {
- struct task_struct *owner;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- owner = pi_state->owner;
- if (owner) {
- raw_spin_lock(&owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock(&owner->pi_lock);
- }
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- }
-
- if (current->pi_state_cache) {
- kfree(pi_state);
- } else {
- /*
- * pi_state->list is already empty.
- * clear pi_state->owner.
- * refcount is at 0 - put it back to 1.
- */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
- }
-}
-
-#ifdef CONFIG_FUTEX_PI
-
-/*
- * This task is holding PI mutexes at exit time => bad.
- * Kernel cleans up PI-state, but userspace is likely hosed.
- * (Robust-futex cleanup is separate and might save the day for userspace.)
- */
-static void exit_pi_state_list(struct task_struct *curr)
-{
- struct list_head *next, *head = &curr->pi_state_list;
- struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
- union futex_key key = FUTEX_KEY_INIT;
-
- if (!futex_cmpxchg_enabled)
- return;
- /*
- * We are a ZOMBIE and nobody can enqueue itself on
- * pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselves:
- */
- raw_spin_lock_irq(&curr->pi_lock);
- while (!list_empty(head)) {
- next = head->next;
- pi_state = list_entry(next, struct futex_pi_state, list);
- key = pi_state->key;
- hb = hash_futex(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
- raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
- }
-
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
- put_pi_state(pi_state);
-
- raw_spin_lock_irq(&curr->pi_lock);
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-}
-#else
-static inline void exit_pi_state_list(struct task_struct *curr) { }
-#endif
-
-/*
- * We need to check the following states:
- *
- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
- *
- * [1] NULL | --- | --- | 0 | 0/1 | Valid
- * [2] NULL | --- | --- | >0 | 0/1 | Valid
- *
- * [3] Found | NULL | -- | Any | 0/1 | Invalid
- *
- * [4] Found | Found | NULL | 0 | 1 | Valid
- * [5] Found | Found | NULL | >0 | 1 | Invalid
- *
- * [6] Found | Found | task | 0 | 1 | Valid
- *
- * [7] Found | Found | NULL | Any | 0 | Invalid
- *
- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
- * [9] Found | Found | task | 0 | 0 | Invalid
- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
- *
- * [1] Indicates that the kernel can acquire the futex atomically. We
- * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
- *
- * [2] Valid, if TID does not belong to a kernel thread. If no matching
- * thread is found then it indicates that the owner TID has died.
- *
- * [3] Invalid. The waiter is queued on a non PI futex
- *
- * [4] Valid state after exit_robust_list(), which sets the user space
- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
- *
- * [5] The user space value got manipulated between exit_robust_list()
- * and exit_pi_state_list()
- *
- * [6] Valid state after exit_pi_state_list() which sets the new owner in
- * the pi_state but cannot access the user space value.
- *
- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
- *
- * [8] Owner and user space value match
- *
- * [9] There is no transient state which sets the user space TID to 0
- * except exit_robust_list(), but this is indicated by the
- * FUTEX_OWNER_DIED bit. See [4]
- *
- * [10] There is no transient state which leaves owner and user space
- * TID out of sync.
- *
- *
- * Serialization and lifetime rules:
- *
- * hb->lock:
- *
- * hb -> futex_q, relation
- * futex_q -> pi_state, relation
- *
- * (cannot be raw because hb can contain arbitrary amount
- * of futex_q's)
- *
- * pi_mutex->wait_lock:
- *
- * {uval, pi_state}
- *
- * (and pi_mutex 'obviously')
- *
- * p->pi_lock:
- *
- * p->pi_state_list -> pi_state->list, relation
- *
- * pi_state->refcount:
- *
- * pi_state lifetime
- *
- *
- * Lock order:
- *
- * hb->lock
- * pi_mutex->wait_lock
- * p->pi_lock
- *
- */
-
-/*
- * Validate that the existing waiter has a pi_state and sanity check
- * the pi_state against the user space value. If correct, attach to
- * it.
- */
-static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_pi_state *pi_state,
- struct futex_pi_state **ps)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- u32 uval2;
- int ret;
-
- /*
- * Userspace might have messed up non-PI and PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
-
- /*
- * We get here with hb->lock held, and having found a
- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
- * which in turn means that futex_lock_pi() still has a reference on
- * our pi_state.
- *
- * The waiter holding a reference on @pi_state also protects against
- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
- * free pi_state before we can take a reference ourselves.
- */
- WARN_ON(!refcount_read(&pi_state->refcount));
-
- /*
- * Now that we have a pi_state, we can acquire wait_lock
- * and do the state validation.
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Since {uval, pi_state} is serialized by wait_lock, and our current
- * uval was read without holding it, it can have changed. Verify it
- * still is what we expect it to be, otherwise retry the entire
- * operation.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- goto out_efault;
-
- if (uval != uval2)
- goto out_eagain;
-
- /*
- * Handle the owner died case:
- */
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and wakes the
- * topmost waiter. The task which acquires the
- * pi_state->rt_mutex will fixup owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user space TID
- * is not 0. Inconsistent state. [5]
- */
- if (pid)
- goto out_einval;
- /*
- * Take a ref on the state and return success. [4]
- */
- goto out_attach;
- }
-
- /*
- * If TID is 0, then either the dying owner has not
- * yet executed exit_pi_state_list() or some waiter
- * acquired the rtmutex in the pi state, but did not
- * yet fixup the TID in user space.
- *
- * Take a ref on the state and return success. [6]
- */
- if (!pid)
- goto out_attach;
- } else {
- /*
- * If the owner died bit is not set, then the pi_state
- * must have an owner. [7]
- */
- if (!pi_state->owner)
- goto out_einval;
- }
-
- /*
- * Bail out if user space manipulated the futex value. If pi
- * state exists then the owner TID must be the same as the
- * user space TID. [9/10]
- */
- if (pid != task_pid_vnr(pi_state->owner))
- goto out_einval;
-
-out_attach:
- get_pi_state(pi_state);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- *ps = pi_state;
- return 0;
-
-out_einval:
- ret = -EINVAL;
- goto out_error;
-
-out_eagain:
- ret = -EAGAIN;
- goto out_error;
-
-out_efault:
- ret = -EFAULT;
- goto out_error;
-
-out_error:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-/**
- * wait_for_owner_exiting - Block until the owner has exited
- * @ret: owner's current futex lock status
- * @exiting: Pointer to the exiting task
- *
- * Caller must hold a refcount on @exiting.
- */
-static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
-{
- if (ret != -EBUSY) {
- WARN_ON_ONCE(exiting);
- return;
- }
-
- if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
- return;
-
- mutex_lock(&exiting->futex_exit_mutex);
- /*
- * No point in doing state checking here. If the waiter got here
- * while the task was in exec()->exec_futex_release() then it can
- * have any FUTEX_STATE_* value when the waiter has acquired the
- * mutex. OK, if running, EXITING or DEAD if it reached exit()
- * already. Highly unlikely and not a problem. Just one more round
- * through the futex maze.
- */
- mutex_unlock(&exiting->futex_exit_mutex);
-
- put_task_struct(exiting);
-}
-
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
- struct task_struct *tsk)
-{
- u32 uval2;
-
- /*
- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
- * caller that the alleged owner is busy.
- */
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
- return -EBUSY;
-
- /*
- * Reread the user space value to handle the following situation:
- *
- * CPU0 CPU1
- *
- * sys_exit() sys_futex()
- * do_exit() futex_lock_pi()
- * futex_lock_pi_atomic()
- * exit_signals(tsk) No waiters:
- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
- * mm_release(tsk) Set waiter bit
- * exit_robust_list(tsk) { *uaddr = 0x80000PID;
- * Set owner died attach_to_pi_owner() {
- * *uaddr = 0xC0000000; tsk = get_task(PID);
- * } if (!tsk->flags & PF_EXITING) {
- * ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
- * FUTEX_STATE_DEAD)
- * return -EAGAIN;
- * return -ESRCH; <--- FAIL
- * }
- *
- * Returning ESRCH unconditionally is wrong here because the
- * user space value has been changed by the exiting task.
- *
- * The same logic applies to the case where the exiting task is
- * already gone.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- return -EFAULT;
-
- /* If the user space value has changed, try again. */
- if (uval2 != uval)
- return -EAGAIN;
-
- /*
- * The exiting task did not have a robust list, the robust list was
- * corrupted or the user space value in *uaddr is simply bogus.
- * Give up and tell user space.
- */
- return -ESRCH;
-}
-
-/*
- * Lookup the task for the TID provided from user space and attach to
- * it after doing proper sanity checks.
- */
-static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
- struct task_struct *p;
-
- /*
- * We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0 [1]
- *
- * The !pid check is paranoid. None of the call sites should end up
- * with pid == 0, but better safe than sorry. Let the caller retry
- */
- if (!pid)
- return -EAGAIN;
- p = find_get_task_by_vpid(pid);
- if (!p)
- return handle_exit_race(uaddr, uval, NULL);
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- put_task_struct(p);
- return -EPERM;
- }
-
- /*
- * We need to look at the task state to figure out, whether the
- * task is exiting. To protect against the change of the task state
- * in futex_exit_release(), we do this protected by p->pi_lock:
- */
- raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
- /*
- * The task is on the way out. When the futex state is
- * FUTEX_STATE_DEAD, we know that the task has finished
- * the cleanup:
- */
- int ret = handle_exit_race(uaddr, uval, p);
-
- raw_spin_unlock_irq(&p->pi_lock);
- /*
- * If the owner task is between FUTEX_STATE_EXITING and
- * FUTEX_STATE_DEAD then store the task pointer and keep
- * the reference on the task struct. The calling code will
- * drop all locks, wait for the task to reach
- * FUTEX_STATE_DEAD and then drop the refcount. This is
- * required to prevent a live lock when the current task
- * preempted the exiting task between the two states.
- */
- if (ret == -EBUSY)
- *exiting = p;
- else
- put_task_struct(p);
- return ret;
- }
-
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
- raw_spin_unlock_irq(&p->pi_lock);
-
- put_task_struct(p);
-
- *ps = pi_state;
-
- return 0;
-}
-
-static int lookup_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- struct futex_q *top_waiter = futex_top_waiter(hb, key);
-
- /*
- * If there is a waiter on that futex, validate it and
- * attach to the pi_state when the validation succeeds.
- */
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * We are the first waiter - try to look up the owner based on
- * @uval and attach to it.
- */
- return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
-}
-
-static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
-{
- int err;
- u32 uninitialized_var(curval);
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (unlikely(err))
- return err;
-
- /* If user space value changed, let the caller retry */
- return curval != uval ? -EAGAIN : 0;
-}
-
-/**
- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the
- * lookup
- * @task: the task to perform the atomic lock work for. This will
- * be "current" except in the case of requeue pi.
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Return:
- * - 0 - ready to wait;
- * - 1 - acquired the lock;
- * - <0 - error
- *
- * The hb->lock and futex_key refs shall be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- */
-static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
- union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct *task,
- struct task_struct **exiting,
- int set_waiters)
-{
- u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *top_waiter;
- int ret;
-
- /*
- * Read the user space value first so we can validate a few
- * things before proceeding further.
- */
- if (get_futex_value_locked(&uval, uaddr))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Detect deadlocks.
- */
- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
- return -EDEADLK;
-
- if ((unlikely(should_fail_futex(true))))
- return -EDEADLK;
-
- /*
- * Lookup existing state first. If it exists, try to attach to
- * its pi_state.
- */
- top_waiter = futex_top_waiter(hb, key);
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * No waiter and user TID is 0. We are here because the
- * waiters or the owner died bit is set or called from
- * requeue_cmp_pi or for whatever reason something took the
- * syscall.
- */
- if (!(uval & FUTEX_TID_MASK)) {
- /*
- * We take over the futex. No other waiters and the user space
- * TID is 0. We preserve the owner died bit.
- */
- newval = uval & FUTEX_OWNER_DIED;
- newval |= vpid;
-
- /* The futex requeue_pi code can enforce the waiters bit */
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
- }
-
- /*
- * First waiter. Set the waiters bit before attaching ourself to
- * the owner. If owner tries to unlock, it will be forced into
- * the kernel and blocked on hb->lock.
- */
- newval = uval | FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the update of the user space value succeeded, we try to
- * attach to the owner. If that fails, no harm done, we only
- * set the FUTEX_WAITERS bit in the user space variable.
- */
- return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
-}
-
-/**
- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be NULL and must be held by the caller.
- */
-static void __unqueue_futex(struct futex_q *q)
-{
- struct futex_hash_bucket *hb;
-
- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
- return;
- lockdep_assert_held(q->lock_ptr);
-
- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-}
-
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
-{
- struct task_struct *p = q->task;
-
- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
-
- get_task_struct(p);
- __unqueue_futex(q);
- /*
- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
- * is written, without taking any locks. This is possible in the event
- * of a spurious wakeup, for example. A memory barrier is required here
- * to prevent the following store to lock_ptr from getting ahead of the
- * plist_del in __unqueue_futex().
- */
- smp_store_release(&q->lock_ptr, NULL);
-
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock.
- */
- wake_q_add_safe(wake_q, p);
-}
-
-/*
- * Caller must hold a reference on @pi_state.
- */
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
-{
- u32 uninitialized_var(curval), newval;
- struct task_struct *new_owner;
- bool postunlock = false;
- DEFINE_WAKE_Q(wake_q);
- int ret = 0;
-
- new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!new_owner)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
- /*
- * We pass it to the next owner. The WAITERS bit is always kept
- * enabled while there is PI state around. We cleanup the owner
- * died bit, because we are the owner.
- */
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
- if (unlikely(should_fail_futex(true)))
- ret = -EFAULT;
-
- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (!ret && (curval != uval)) {
- /*
- * If a unconditional UNLOCK_PI operation (user space did not
- * try the TID->0 transition) raced with a waiter setting the
- * FUTEX_WAITERS flag between get_user() and locking the hash
- * bucket lock, retry the operation.
- */
- if ((FUTEX_TID_MASK & curval) == uval)
- ret = -EAGAIN;
- else
- ret = -EINVAL;
- }
-
- if (ret)
- goto out_unlock;
-
- /*
- * This is a point of no return; once we modify the uval there is no
- * going back and subsequent operations must not fail.
- */
-
- raw_spin_lock(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock(&pi_state->owner->pi_lock);
-
- raw_spin_lock(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock(&new_owner->pi_lock);
-
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
-out_unlock:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-
- return ret;
-}
-
-/*
- * Express the locking dependencies for lockdep:
- */
-static inline void
-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- if (hb1 <= hb2) {
- spin_lock(&hb1->lock);
- if (hb1 < hb2)
- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
- } else { /* hb1 > hb2 */
- spin_lock(&hb2->lock);
- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
- }
-}
-
-static inline void
-double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
-}
-
-/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
- */
-static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
-{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
- union futex_key key = FUTEX_KEY_INIT;
- int ret;
- DEFINE_WAKE_Q(wake_q);
-
- if (!bitset)
- return -EINVAL;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
- if (unlikely(ret != 0))
- goto out;
-
- hb = hash_futex(&key);
-
- /* Make sure we really have tasks to wakeup */
- if (!hb_waiters_pending(hb))
- goto out_put_key;
-
- spin_lock(&hb->lock);
-
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex (&this->key, &key)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- break;
- }
-
- /* Check if one of the bits is set in both bitsets */
- if (!(this->bitset & bitset))
- continue;
-
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
-out_put_key:
- put_futex_key(&key);
-out:
- return ret;
-}
-
-static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
-{
- unsigned int op = (encoded_op & 0x70000000) >> 28;
- unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
- int oldval, ret;
-
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
- /*
- * kill this print and return -EINVAL when userspace
- * is sane again
- */
- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
- oparg &= 31;
- }
- oparg = 1 << oparg;
- }
-
- pagefault_disable();
- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
- pagefault_enable();
- if (ret)
- return ret;
-
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- return oldval == cmparg;
- case FUTEX_OP_CMP_NE:
- return oldval != cmparg;
- case FUTEX_OP_CMP_LT:
- return oldval < cmparg;
- case FUTEX_OP_CMP_GE:
- return oldval >= cmparg;
- case FUTEX_OP_CMP_LE:
- return oldval <= cmparg;
- case FUTEX_OP_CMP_GT:
- return oldval > cmparg;
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
- */
-static int
-futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
- int nr_wake, int nr_wake2, int op)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- int ret, op_ret;
- DEFINE_WAKE_Q(wake_q);
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out_put_key1;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- goto out_put_keys;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- goto out_put_keys;
- }
-
- if (!(flags & FLAGS_SHARED)) {
- cond_resched();
- goto retry_private;
- }
-
- put_futex_key(&key2);
- put_futex_key(&key1);
- cond_resched();
- goto retry;
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (match_futex (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (match_futex (&this->key, &key2)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++op_ret >= nr_wake2)
- break;
- }
- }
- ret += op_ret;
- }
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
- return ret;
-}
-
-/**
- * requeue_futex() - Requeue a futex_q from one hb to another
- * @q: the futex_q to requeue
- * @hb1: the source hash_bucket
- * @hb2: the target hash_bucket
- * @key2: the new key for the requeued futex_q
- */
-static inline
-void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key2)
-{
-
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(&hb1->chain != &hb2->chain)) {
- plist_del(&q->list, &hb1->chain);
- hb_waiters_dec(hb1);
- hb_waiters_inc(hb2);
- plist_add(&q->list, &hb2->chain);
- q->lock_ptr = &hb2->lock;
- }
- q->key = *key2;
-}
-
-/**
- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * @q: the futex_q
- * @key: the key of the requeue target futex
- * @hb: the hash_bucket of the requeue target futex
- *
- * During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
- */
-static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
- struct futex_hash_bucket *hb)
-{
- q->key = *key;
-
- __unqueue_futex(q);
-
- WARN_ON(!q->rt_waiter);
- q->rt_waiter = NULL;
-
- q->lock_ptr = &hb->lock;
-
- wake_up_state(q->task, TASK_NORMAL);
-}
-
-/**
- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex: the user address of the to futex
- * @hb1: the from futex hash bucket, must be locked by the caller
- * @hb2: the to futex hash bucket, must be locked by the caller
- * @key1: the from futex key
- * @key2: the to futex key
- * @ps: address to store the pi_state pointer
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed. If the caller specified set_waiters,
- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
- * hb1 and hb2 must be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- *
- * Return:
- * - 0 - failed to acquire the lock atomically;
- * - >0 - acquired the lock, return value is vpid of the top_waiter
- * - <0 - error
- */
-static int
-futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key1,
- union futex_key *key2, struct futex_pi_state **ps,
- struct task_struct **exiting, int set_waiters)
-{
- struct futex_q *top_waiter = NULL;
- u32 curval;
- int ret, vpid;
-
- if (get_futex_value_locked(&curval, pifutex))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Find the top_waiter and determine if there are additional waiters.
- * If the caller intends to requeue more than 1 waiter to pifutex,
- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
- * as we have means to handle the possible fault. If not, don't set
- * the bit unecessarily as it will force the subsequent unlock to enter
- * the kernel.
- */
- top_waiter = futex_top_waiter(hb1, key1);
-
- /* There are no waiters, nothing for us to do. */
- if (!top_waiter)
- return 0;
-
- /* Ensure we requeue to the expected futex. */
- if (!match_futex(top_waiter->requeue_pi_key, key2))
- return -EINVAL;
-
- /*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
- */
- vpid = task_pid_vnr(top_waiter->task);
- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- exiting, set_waiters);
- if (ret == 1) {
- requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
- }
- return ret;
-}
-
-/**
- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @uaddr2: target futex user address
- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * @nr_requeue: number of waiters to requeue (0-INT_MAX)
- * @cmpval: @uaddr1 expected value (or %NULL)
- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
- *
- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
- * uaddr2 atomically on behalf of the top waiter.
- *
- * Return:
- * - >=0 - on success, the number of tasks requeued or woken;
- * - <0 - on error
- */
-static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
- u32 *cmpval, int requeue_pi)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- int task_count = 0, ret;
- struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- DEFINE_WAKE_Q(wake_q);
-
- if (nr_wake < 0 || nr_requeue < 0)
- return -EINVAL;
-
- /*
- * When PI not supported: return -ENOSYS if requeue_pi is true,
- * consequently the compiler knows requeue_pi is always false past
- * this point which will optimize away all the conditional code
- * further down.
- */
- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
- return -ENOSYS;
-
- if (requeue_pi) {
- /*
- * Requeue PI only works on two distinct uaddrs. This
- * check is only valid for private futexes. See below.
- */
- if (uaddr1 == uaddr2)
- return -EINVAL;
-
- /*
- * requeue_pi requires a pi_state, try to allocate it now
- * without any locks in case it fails.
- */
- if (refill_pi_state_cache())
- return -ENOMEM;
- /*
- * requeue_pi must wake as many tasks as it can, up to nr_wake
- * + nr_requeue, since it acquires the rt_mutex prior to
- * returning to userspace, so as to not leave the rt_mutex with
- * waiters and no owner. However, second and third wake-ups
- * cannot be predicted as they involve race conditions with the
- * first wake and a fault while looking up the pi_state. Both
- * pthread_cond_signal() and pthread_cond_broadcast() should
- * use nr_wake=1.
- */
- if (nr_wake != 1)
- return -EINVAL;
- }
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? FUTEX_WRITE : FUTEX_READ);
- if (unlikely(ret != 0))
- goto out_put_key1;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (requeue_pi && match_futex(&key1, &key2)) {
- ret = -EINVAL;
- goto out_put_keys;
- }
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
-
- if (likely(cmpval != NULL)) {
- u32 curval;
-
- ret = get_futex_value_locked(&curval, uaddr1);
-
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
-
- ret = get_user(curval, uaddr1);
- if (ret)
- goto out_put_keys;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&key2);
- put_futex_key(&key1);
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
-
- if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
- struct task_struct *exiting = NULL;
-
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
-
- /*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, ret contains the
- * vpid of the top waiter task.
- * If the lock was not taken, we have pi_state and an initial
- * refcount on it. In case of an error we have nothing.
- */
- if (ret > 0) {
- WARN_ON(pi_state);
- task_count++;
- /*
- * If we acquired the lock, then the user space value
- * of uaddr2 should be vpid. It cannot be changed by
- * the top waiter as it is blocked on hb2 lock if it
- * tries to do so. If something fiddled with it behind
- * our back the pi state lookup might unearth it. So
- * we rather use the known value than rereading and
- * handing potential crap to lookup_pi_state.
- *
- * If that call succeeds then we have pi_state and an
- * initial refcount on it.
- */
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
- &pi_state, &exiting);
- }
-
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- /* If the above failed, then pi_state is NULL */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- goto out;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!match_futex(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Wake nr_wake waiters. For requeue_pi, if we acquired the
- * lock, we already woke the top_waiter. If not, it will be
- * woken by futex_unlock_pi().
- */
- if (++task_count <= nr_wake && !requeue_pi) {
- mark_wake_futex(&wake_q, this);
- continue;
- }
-
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- */
- if (requeue_pi) {
- /*
- * Prepare the waiter to take the rt_mutex. Take a
- * refcount on the pi_state and store the pointer in
- * the futex_q object of the waiter.
- */
- get_pi_state(pi_state);
- this->pi_state = pi_state;
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the
- * refcount on pi_state nor clear
- * this->pi_state because the waiter needs the
- * pi_state for cleaning up the user space
- * value. It will drop the refcount after
- * doing so.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- continue;
- } else if (ret) {
- /*
- * rt_mutex_start_proxy_lock() detected a
- * potential deadlock when we tried to queue
- * that waiter. Drop the pi_state reference
- * which we took above and remove the pointer
- * to the state from the waiters futex_q
- * object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- /*
- * We stop queueing more waiters and let user
- * space deal with the mess.
- */
- break;
- }
- }
- requeue_futex(this, hb1, hb2, &key2);
- }
-
- /*
- * We took an extra initial reference to the pi_state either
- * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
- * need to drop it here again.
- */
- put_pi_state(pi_state);
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
- hb_waiters_dec(hb2);
-
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
- return ret ? ret : task_count;
-}
-
-/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
- __acquires(&hb->lock)
-{
- struct futex_hash_bucket *hb;
-
- hb = hash_futex(&q->key);
-
- /*
- * Increment the counter before taking the lock so that
- * a potential waker won't miss a to-be-slept task that is
- * waiting for the spinlock. This is safe as all queue_lock()
- * users end up calling queue_me(). Similarly, for housekeeping,
- * decrement the counter at queue_unlock() when some error has
- * occurred and we don't end up adding the task to the list.
- */
- hb_waiters_inc(hb); /* implies smp_mb(); (A) */
-
- q->lock_ptr = &hb->lock;
-
- spin_lock(&hb->lock);
- return hb;
-}
-
-static inline void
-queue_unlock(struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- spin_unlock(&hb->lock);
- hb_waiters_dec(hb);
-}
-
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-{
- int prio;
-
- /*
- * The priority used to register this element is
- * - either the real thread-priority for the real-time threads
- * (i.e. threads with a priority lower than MAX_RT_PRIO)
- * - or MAX_RT_PRIO for non-RT threads.
- * Thus, all RT-threads are woken first in priority order, and
- * the others are woken last, in FIFO order.
- */
- prio = min(current->normal_prio, MAX_RT_PRIO);
-
- plist_node_init(&q->list, prio);
- plist_add(&q->list, &hb->chain);
- q->task = current;
-}
-
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- __queue_me(q, hb);
- spin_unlock(&hb->lock);
-}
-
-/**
- * unqueue_me() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
- * be paired with exactly one earlier call to queue_me().
- *
- * Return:
- * - 1 - if the futex_q was still queued (and we removed unqueued it);
- * - 0 - if the futex_q was already removed by the waking thread
- */
-static int unqueue_me(struct futex_q *q)
-{
- spinlock_t *lock_ptr;
- int ret = 0;
-
- /* In the common case we don't take the spinlock, which is nice. */
-retry:
- /*
- * q->lock_ptr can change between this read and the following spin_lock.
- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
- * optimizing lock_ptr out of the logic below.
- */
- lock_ptr = READ_ONCE(q->lock_ptr);
- if (lock_ptr != NULL) {
- spin_lock(lock_ptr);
- /*
- * q->lock_ptr can change between reading it and
- * spin_lock(), causing us to take the wrong lock. This
- * corrects the race condition.
- *
- * Reasoning goes like this: if we have the wrong lock,
- * q->lock_ptr must have changed (maybe several times)
- * between reading it and the spin_lock(). It can
- * change again after the spin_lock() but only if it was
- * already changed before the spin_lock(). It cannot,
- * however, change back to the original value. Therefore
- * we can detect whether we acquired the correct lock.
- */
- if (unlikely(lock_ptr != q->lock_ptr)) {
- spin_unlock(lock_ptr);
- goto retry;
- }
- __unqueue_futex(q);
-
- BUG_ON(q->pi_state);
-
- spin_unlock(lock_ptr);
- ret = 1;
- }
-
- return ret;
-}
-
-/*
- * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
- * and dropped here.
- */
-static void unqueue_me_pi(struct futex_q *q)
- __releases(q->lock_ptr)
-{
- __unqueue_futex(q);
-
- BUG_ON(!q->pi_state);
- put_pi_state(q->pi_state);
- q->pi_state = NULL;
-
- spin_unlock(q->lock_ptr);
-}
-
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
-{
- struct futex_pi_state *pi_state = q->pi_state;
- u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner, *newowner;
- u32 newtid;
- int ret, err = 0;
-
- lockdep_assert_held(q->lock_ptr);
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- oldowner = pi_state->owner;
-
- /*
- * We are here because either:
- *
- * - we stole the lock and pi_state->owner needs updating to reflect
- * that (@argowner == current),
- *
- * or:
- *
- * - someone stole our lock and we need to fix things to point to the
- * new owner (@argowner == NULL).
- *
- * Either way, we have to replace the TID in the user space variable.
- * This must be atomic as we have to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the pi_state
- * because we can fault here. Imagine swapped out pages or a fork
- * that marked all the anonymous memory readonly for cow.
- *
- * Modifying pi_state _before_ the user space value would leave the
- * pi_state in an inconsistent state when we fault here, because we
- * need to drop the locks to handle the fault. This might be observed
- * in the PID check in lookup_pi_state.
- */
-retry:
- if (!argowner) {
- if (oldowner != current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- ret = 0;
- goto out_unlock;
- }
-
- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
- /* We got the lock after all, nothing to fix. */
- ret = 0;
- goto out_unlock;
- }
-
- /*
- * Since we just failed the trylock; there must be an owner.
- */
- newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
- } else {
- WARN_ON_ONCE(argowner != current);
- if (oldowner == current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- ret = 0;
- goto out_unlock;
- }
- newowner = argowner;
- }
-
- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
- err = get_futex_value_locked(&uval, uaddr);
- if (err)
- goto handle_err;
-
- for (;;) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (err)
- goto handle_err;
-
- if (curval == uval)
- break;
- uval = curval;
- }
-
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
- if (pi_state->owner != NULL) {
- raw_spin_lock(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock(&pi_state->owner->pi_lock);
- }
-
- pi_state->owner = newowner;
-
- raw_spin_lock(&newowner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock(&newowner->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- return 0;
-
- /*
- * In order to reschedule or handle a page fault, we need to drop the
- * locks here. In the case of a fault, this gives the other task
- * (either the highest priority waiter itself or the task which stole
- * the rtmutex) the chance to try the fixup of the pi_state. So once we
- * are back from handling the fault we need to check the pi_state after
- * reacquiring the locks and before trying to do another fixup. When
- * the fixup has been done already we simply return.
- *
- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
- * drop hb->lock since the caller owns the hb -> futex_q relation.
- * Dropping the pi_mutex->wait_lock requires the state revalidate.
- */
-handle_err:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(q->lock_ptr);
-
- switch (err) {
- case -EFAULT:
- ret = fault_in_user_writeable(uaddr);
- break;
-
- case -EAGAIN:
- cond_resched();
- ret = 0;
- break;
-
- default:
- WARN_ON_ONCE(1);
- ret = err;
- break;
- }
-
- spin_lock(q->lock_ptr);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner) {
- ret = 0;
- goto out_unlock;
- }
-
- if (ret)
- goto out_unlock;
-
- goto retry;
-
-out_unlock:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-static long futex_wait_restart(struct restart_block *restart);
-
-/**
- * fixup_owner() - Post lock pi_state and corner case management
- * @uaddr: user address of the futex
- * @q: futex_q (contains pi_state and access to the rt_mutex)
- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
- *
- * After attempting to lock an rt_mutex, this function is called to cleanup
- * the pi_state owner as well as handle race conditions that may allow us to
- * acquire the lock. Must be called with the hb lock held.
- *
- * Return:
- * - 1 - success, lock taken;
- * - 0 - success, lock not taken;
- * - <0 - on error (-EFAULT)
- */
-static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
-{
- int ret = 0;
-
- if (locked) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case:
- *
- * Speculative pi_state->owner read (we don't hold wait_lock);
- * since we own the lock pi_state->owner == current is the
- * stable state, anything else needs more attention.
- */
- if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current);
- goto out;
- }
-
- /*
- * If we didn't get the lock; check if anybody stole it from us. In
- * that case, we need to fix up the uval to point to them instead of
- * us, otherwise bad things happen. [10]
- *
- * Another speculative read; pi_state->owner == current is unstable
- * but needs our attention.
- */
- if (q->pi_state->owner == current) {
- ret = fixup_pi_state_owner(uaddr, q, NULL);
- goto out;
- }
-
- /*
- * Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex.
- */
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
- printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
- "pi-state %p\n", ret,
- q->pi_state->pi_mutex.owner,
- q->pi_state->owner);
- }
-
-out:
- return ret ? ret : locked;
-}
-
-/**
- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
- * @q: the futex_q to queue up on
- * @timeout: the prepared hrtimer_sleeper, or null for no timeout
- */
-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
-{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * queue_me() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- queue_me(q, hb);
-
- /* Arm the timer */
- if (timeout)
- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
-
- /*
- * If we have been removed from the hash list, then another task
- * has tried to wake us, and we can skip the call to schedule().
- */
- if (likely(!plist_node_empty(&q->list))) {
- /*
- * If the timer has already expired, current will already be
- * flagged for rescheduling. Only call schedule if there
- * is no timeout, or if it has yet to expire.
- */
- if (!timeout || timeout->task)
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-/**
- * futex_wait_setup() - Prepare to wait on a futex
- * @uaddr: the futex userspace address
- * @val: the expected value
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
- *
- * Setup the futex_q and locate the hash_bucket. Get the futex value and
- * compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held and a q.key reference on success, and unlocked
- * with no q.key reference on failure.
- *
- * Return:
- * - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
- */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
-{
- u32 uval;
- int ret;
-
- /*
- * Access the page AFTER the hash-bucket is locked.
- * Order is important:
- *
- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
- *
- * The basic logical guarantee of a futex is that it blocks ONLY
- * if cond(var) is known to be true at the time of blocking, for
- * any cond. If we locked the hash-bucket after testing *uaddr, that
- * would open a race condition where we could block indefinitely with
- * cond(var) false, which would violate the guarantee.
- *
- * On the other hand, we insert q and release the hash-bucket only
- * after testing *uaddr. This guarantees that futex_wait() will NOT
- * absorb a wakeup if *uaddr does not match the desired values
- * while the syscall executes.
- */
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
-retry_private:
- *hb = queue_lock(q);
-
- ret = get_futex_value_locked(&uval, uaddr);
-
- if (ret) {
- queue_unlock(*hb);
-
- ret = get_user(uval, uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&q->key);
- goto retry;
- }
-
- if (uval != val) {
- queue_unlock(*hb);
- ret = -EWOULDBLOCK;
- }
-
-out:
- if (ret)
- put_futex_key(&q->key);
- return ret;
-}
-
-static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset)
-{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int ret;
-
- if (!bitset)
- return -EINVAL;
- q.bitset = bitset;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-retry:
- /*
- * Prepare to wait on uaddr. On success, holds hb lock and increments
- * q.key refs.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to);
-
- /* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
- /* unqueue_me() drops q.key ref */
- if (!unqueue_me(&q))
- goto out;
- ret = -ETIMEDOUT;
- if (to && !to->task)
- goto out;
-
- /*
- * We expect signal_pending(current), but we might be the
- * victim of a spurious wakeup as well.
- */
- if (!signal_pending(current))
- goto retry;
-
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
-
- restart = &current->restart_block;
- restart->fn = futex_wait_restart;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
-
- ret = -ERESTART_RESTARTBLOCK;
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-
-static long futex_wait_restart(struct restart_block *restart)
-{
- u32 __user *uaddr = restart->futex.uaddr;
- ktime_t t, *tp = NULL;
-
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t = restart->futex.time;
- tp = &t;
- }
- restart->fn = do_no_restart_syscall;
-
- return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
-}
-
-
-/*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block as a consequence of relying
- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
- * a 0 value of the futex too.).
- *
- * Also serves as futex trylock_pi()'ing, and due semantics.
- */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
- ktime_t *time, int trylock)
-{
- struct hrtimer_sleeper timeout, *to;
- struct futex_pi_state *pi_state = NULL;
- struct task_struct *exiting = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (refill_pi_state_cache())
- return -ENOMEM;
-
- to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
-
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
-retry_private:
- hb = queue_lock(&q);
-
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- queue_unlock(hb);
- put_futex_key(&q.key);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
- }
- }
-
- WARN_ON(!q.pi_state);
-
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __queue_me(&q, hb);
-
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
-
- rt_mutex_init_waiter(&rt_waiter);
-
- /*
- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
-
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
-
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
-
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
-
-cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
-
-no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr, &q, !ret);
- /*
- * If fixup_owner() returned an error, proprogate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- /*
- * If fixup_owner() faulted and was unable to handle the fault, unlock
- * it and return the fault to userspace.
- */
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
- pi_state = q.pi_state;
- get_pi_state(pi_state);
- }
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q);
-
- if (pi_state) {
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
- put_pi_state(pi_state);
- }
-
- goto out_put_key;
-
-out_unlock_put_key:
- queue_unlock(hb);
-
-out_put_key:
- put_futex_key(&q.key);
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- queue_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out_put_key;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&q.key);
- goto retry;
-}
-
-/*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
-static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
-{
- u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
- union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
- struct futex_q *top_waiter;
- int ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
-retry:
- if (get_user(uval, uaddr))
- return -EFAULT;
- /*
- * We release only a lock we actually own:
- */
- if ((uval & FUTEX_TID_MASK) != vpid)
- return -EPERM;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
- if (ret)
- return ret;
-
- hb = hash_futex(&key);
- spin_lock(&hb->lock);
-
- /*
- * Check waiters first. We do not trust user space values at
- * all and we at least want to know if user space fiddled
- * with the futex value instead of blindly unlocking.
- */
- top_waiter = futex_top_waiter(hb, &key);
- if (top_waiter) {
- struct futex_pi_state *pi_state = top_waiter->pi_state;
-
- ret = -EINVAL;
- if (!pi_state)
- goto out_unlock;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- goto out_unlock;
-
- get_pi_state(pi_state);
- /*
- * By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_pi() must observe a state consistent with what we
- * observed.
- *
- * In particular; this forces __rt_mutex_start_proxy() to
- * complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- /* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
-
- put_pi_state(pi_state);
-
- /*
- * Success, we're done! No tricky corner cases.
- */
- if (!ret)
- goto out_putkey;
- /*
- * The atomic access to the futex value generated a
- * pagefault, so retry the user-access and the wakeup:
- */
- if (ret == -EFAULT)
- goto pi_faulted;
- /*
- * A unconditional UNLOCK_PI op raced against a waiter
- * setting the FUTEX_WAITERS bit. Try again.
- */
- if (ret == -EAGAIN)
- goto pi_retry;
- /*
- * wake_futex_pi has detected invalid state. Tell user
- * space.
- */
- goto out_putkey;
- }
-
- /*
- * We have no kernel internal state, i.e. no waiters in the
- * kernel. Waiters which are about to queue themselves are stuck
- * on hb->lock. So we can safely ignore them. We do neither
- * preserve the WAITERS bit not the OWNER_DIED one. We are the
- * owner.
- */
- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
- spin_unlock(&hb->lock);
- switch (ret) {
- case -EFAULT:
- goto pi_faulted;
-
- case -EAGAIN:
- goto pi_retry;
-
- default:
- WARN_ON_ONCE(1);
- goto out_putkey;
- }
- }
-
- /*
- * If uval has changed, let user space handle it.
- */
- ret = (curval == uval) ? 0 : -EAGAIN;
-
-out_unlock:
- spin_unlock(&hb->lock);
-out_putkey:
- put_futex_key(&key);
- return ret;
-
-pi_retry:
- put_futex_key(&key);
- cond_resched();
- goto retry;
-
-pi_faulted:
- put_futex_key(&key);
-
- ret = fault_in_user_writeable(uaddr);
- if (!ret)
- goto retry;
-
- return ret;
-}
-
-/**
- * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
- * @hb: the hash_bucket futex_q was original enqueued on
- * @q: the futex_q woken while waiting to be requeued
- * @key2: the futex_key of the requeue target futex
- * @timeout: the timeout associated with the wait (NULL if none)
- *
- * Detect if the task was woken on the initial futex as opposed to the requeue
- * target futex. If so, determine if it was a timeout or a signal that caused
- * the wakeup and return the appropriate error code to the caller. Must be
- * called with the hb lock held.
- *
- * Return:
- * - 0 = no early wakeup detected;
- * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
- */
-static inline
-int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q, union futex_key *key2,
- struct hrtimer_sleeper *timeout)
-{
- int ret = 0;
-
- /*
- * With the hb lock held, we avoid races while we process the wakeup.
- * We only need to hold hb (and not hb2) to ensure atomicity as the
- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
- * It can't be requeued from uaddr2 to something else since we don't
- * support a PI aware source futex for requeue.
- */
- if (!match_futex(&q->key, key2)) {
- WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- }
- return ret;
-}
-
-/**
- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr: the futex we initially wait on (non-pi)
- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
- * @val: the expected value of uaddr
- * @abs_time: absolute timeout
- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @uaddr2: the pi futex we will take prior to returning to user-space
- *
- * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
- * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
- * without one, the pi logic would not know which task to boost/deboost, if
- * there was a need to.
- *
- * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following--
- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue
- * 3) signal
- * 4) timeout
- *
- * If 3, cleanup and return -ERESTARTNOINTR.
- *
- * If 2, we may then block on trying to take the rt_mutex and return via:
- * 5) successful lock
- * 6) signal
- * 7) timeout
- * 8) other lock acquisition failure
- *
- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
- *
- * If 4 or 7, we cleanup and return with -ETIMEDOUT.
- *
- * Return:
- * - 0 - On success;
- * - <0 - On error
- */
-static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
- u32 val, ktime_t *abs_time, u32 bitset,
- u32 __user *uaddr2)
-{
- struct hrtimer_sleeper timeout, *to;
- struct futex_pi_state *pi_state = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- union futex_key key2 = FUTEX_KEY_INIT;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (uaddr == uaddr2)
- return -EINVAL;
-
- if (!bitset)
- return -EINVAL;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-
- /*
- * The waiter is allocated on our stack, manipulated by the requeue
- * code while we sleep on uaddr.
- */
- rt_mutex_init_waiter(&rt_waiter);
-
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
- q.requeue_pi_key = &key2;
-
- /*
- * Prepare to wait on uaddr. On success, increments q.key (key1) ref
- * count.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out_key2;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (match_futex(&q.key, &key2)) {
- queue_unlock(hb);
- ret = -EINVAL;
- goto out_put_keys;
- }
-
- /* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to);
-
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out_put_keys;
-
- /*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
- */
-
- /* Check if the requeue code acquired the second futex for us. */
- if (!q.rt_waiter) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case.
- */
- if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
- pi_state = q.pi_state;
- get_pi_state(pi_state);
- }
- /*
- * Drop the reference to the pi state which
- * the requeue_pi() code acquired for us.
- */
- put_pi_state(q.pi_state);
- spin_unlock(q.lock_ptr);
- }
- } else {
- struct rt_mutex *pi_mutex;
-
- /*
- * We have been woken up by futex_unlock_pi(), a timeout, or a
- * signal. futex_unlock_pi() will not destroy the lock_ptr nor
- * the pi_state.
- */
- WARN_ON(!q.pi_state);
- pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
-
- spin_lock(q.lock_ptr);
- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
- ret = 0;
-
- debug_rt_mutex_free_waiter(&rt_waiter);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr2, &q, !ret);
- /*
- * If fixup_owner() returned an error, proprogate that. If it
- * acquired the lock, clear -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle
- * the fault, unlock the rt_mutex and return the fault to
- * userspace.
- */
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
- pi_state = q.pi_state;
- get_pi_state(pi_state);
- }
-
- /* Unqueue and drop the lock. */
- unqueue_me_pi(&q);
- }
-
- if (pi_state) {
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
- put_pi_state(pi_state);
- }
-
- if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart by calling
- * futex_lock_pi() directly. We could restart this syscall, but
- * it would detect that the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart and return
- * -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
- }
-
-out_put_keys:
- put_futex_key(&q.key);
-out_key2:
- put_futex_key(&key2);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-/*
- * Support for robust futexes: the kernel cleans up held futexes at
- * thread exit time.
- *
- * Implementation: user-space maintains a per-thread list of locks it
- * is holding. Upon do_exit(), the kernel carefully walks this list,
- * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
- * always manipulated with the lock held, so the list is private and
- * per-thread. Userspace also maintains a per-thread 'list_op_pending'
- * field, to allow the kernel to clean up if the thread dies after
- * acquiring the lock, but just before it could have added itself to
- * the list. There can only be one such pending lock.
- */
-
-/**
- * sys_set_robust_list() - Set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
- */
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- /*
- * The kernel knows only one size for now:
- */
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->robust_list = head;
-
- return 0;
-}
-
-/**
- * sys_get_robust_list() - Get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
- */
-SYSCALL_DEFINE3(get_robust_list, int, pid,
- struct robust_list_head __user * __user *, head_ptr,
- size_t __user *, len_ptr)
-{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-/* Constants for the pending_op argument of handle_futex_death */
-#define HANDLE_DEATH_PENDING true
-#define HANDLE_DEATH_LIST false
-
-/*
- * Process a futex-list entry, check whether it's owned by the
- * dying task, and do notification if so:
- */
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
- bool pi, bool pending_op)
-{
- u32 uval, uninitialized_var(nval), mval;
- int err;
-
- /* Futex address must be 32bit aligned */
- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
- return -1;
-
-retry:
- if (get_user(uval, uaddr))
- return -1;
-
- /*
- * Special case for regular (non PI) futexes. The unlock path in
- * user space has two race scenarios:
- *
- * 1. The unlock path releases the user space futex value and
- * before it can execute the futex() syscall to wake up
- * waiters it is killed.
- *
- * 2. A woken up waiter is killed before it can acquire the
- * futex in user space.
- *
- * In both cases the TID validation below prevents a wakeup of
- * potential waiters which can cause these waiters to block
- * forever.
- *
- * In both cases the following conditions are met:
- *
- * 1) task->robust_list->list_op_pending != NULL
- * @pending_op == true
- * 2) User space futex value == 0
- * 3) Regular futex: @pi == false
- *
- * If these conditions are met, it is safe to attempt waking up a
- * potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
- */
- if (pending_op && !pi && !uval) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
- return 0;
- }
-
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
- return 0;
-
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
- switch (err) {
- case -EFAULT:
- if (fault_in_user_writeable(uaddr))
- return -1;
- goto retry;
-
- case -EAGAIN:
- cond_resched();
- goto retry;
-
- default:
- WARN_ON_ONCE(1);
- return err;
- }
- }
-
- if (nval != uval)
- goto retry;
-
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-
- return 0;
-}
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int fetch_robust_entry(struct robust_list __user **entry,
- struct robust_list __user * __user *head,
- unsigned int *pi)
-{
- unsigned long uentry;
-
- if (get_user(uentry, (unsigned long __user *)head))
- return -EFAULT;
-
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
-
- return 0;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void exit_robust_list(struct task_struct *curr)
-{
- struct robust_list_head __user *head = curr->robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- unsigned long futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * don't process it twice:
- */
- if (entry != pending) {
- if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi, HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
-
- if (pending) {
- handle_futex_death((void __user *)pending + futex_offset,
- curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-static void futex_cleanup(struct task_struct *tsk)
-{
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
-
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-}
-
-/**
- * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
- * @tsk: task to set the state on
- *
- * Set the futex exit state of the task lockless. The futex waiter code
- * observes that state when a task is exiting and loops until the task has
- * actually finished the futex cleanup. The worst case for this is that the
- * waiter runs through the wait loop until the state becomes visible.
- *
- * This is called from the recursive fault handling path in do_exit().
- *
- * This is best effort. Either the futex exit code has run already or
- * not. If the OWNER_DIED bit has been set on the futex then the waiter can
- * take it over. If not, the problem is pushed back to user space. If the
- * futex exit code did not run yet, then an already queued waiter might
- * block forever, but there is nothing which can be done about that.
- */
-void futex_exit_recursive(struct task_struct *tsk)
-{
- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING)
- mutex_unlock(&tsk->futex_exit_mutex);
- tsk->futex_state = FUTEX_STATE_DEAD;
-}
-
-static void futex_cleanup_begin(struct task_struct *tsk)
-{
- /*
- * Prevent various race issues against a concurrent incoming waiter
- * including live locks by forcing the waiter to block on
- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
- * attach_to_pi_owner().
- */
- mutex_lock(&tsk->futex_exit_mutex);
-
- /*
- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
- *
- * This ensures that all subsequent checks of tsk->futex_state in
- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
- * tsk->pi_lock held.
- *
- * It guarantees also that a pi_state which was queued right before
- * the state change under tsk->pi_lock by a concurrent waiter must
- * be observed in exit_pi_state_list().
- */
- raw_spin_lock_irq(&tsk->pi_lock);
- tsk->futex_state = FUTEX_STATE_EXITING;
- raw_spin_unlock_irq(&tsk->pi_lock);
-}
-
-static void futex_cleanup_end(struct task_struct *tsk, int state)
-{
- /*
- * Lockless store. The only side effect is that an observer might
- * take another loop until it becomes visible.
- */
- tsk->futex_state = state;
- /*
- * Drop the exit protection. This unblocks waiters which observed
- * FUTEX_STATE_EXITING to reevaluate the state.
- */
- mutex_unlock(&tsk->futex_exit_mutex);
-}
-
-void futex_exec_release(struct task_struct *tsk)
-{
- /*
- * The state handling is done for consistency, but in the case of
- * exec() there is no way to prevent futher damage as the PID stays
- * the same. But for the unlikely and arguably buggy case that a
- * futex is held on exec(), this provides at least as much state
- * consistency protection which is possible.
- */
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- /*
- * Reset the state to FUTEX_STATE_OK. The task is alive and about
- * exec a new binary.
- */
- futex_cleanup_end(tsk, FUTEX_STATE_OK);
-}
-
-void futex_exit_release(struct task_struct *tsk)
-{
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
-}
-
-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3)
-{
- int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
-
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
- cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_LOCK_PI:
- case FUTEX_UNLOCK_PI:
- case FUTEX_TRYLOCK_PI:
- case FUTEX_WAIT_REQUEUE_PI:
- case FUTEX_CMP_REQUEUE_PI:
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_WAIT:
- val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
- case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
- case FUTEX_WAKE:
- val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
- case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
- case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- case FUTEX_WAKE_OP:
- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- case FUTEX_LOCK_PI:
- return futex_lock_pi(uaddr, flags, timeout, 0);
- case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
- case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, NULL, 1);
- case FUTEX_WAIT_REQUEUE_PI:
- val3 = FUTEX_BITSET_MATCH_ANY;
- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- }
- return -ENOSYS;
-}
-
-
-SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec64 ts;
- ktime_t t, *tp = NULL;
- u32 val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
- return -EFAULT;
- if (get_timespec64(&ts, utime))
- return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- /*
- * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
- * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
- */
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (u32) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
-
-#ifdef CONFIG_COMPAT
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (compat_fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi,
- HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-#endif /* CONFIG_COMPAT */
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
- struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec64 ts;
- ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (get_old_timespec32(&ts, utime))
- return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
-#endif /* CONFIG_COMPAT_32BIT_TIME */
-
-static void __init futex_detect_cmpxchg(void)
-{
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
- u32 curval;
-
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
-#endif
-}
-
-static int __init futex_init(void)
-{
- unsigned int futex_shift;
- unsigned long i;
-
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
-#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
-#endif
-
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- futex_detect_cmpxchg();
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
- }
-
- return 0;
-}
-core_initcall(futex_init);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
new file mode 100644
index 000000000000..b77188d1fa07
--- /dev/null
+++ b/kernel/futex/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
new file mode 100644
index 000000000000..1e78ef24321e
--- /dev/null
+++ b/kernel/futex/core.c
@@ -0,0 +1,1173 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ */
+#include <linux/compat.h>
+#include <linux/jhash.h>
+#include <linux/pagemap.h>
+#include <linux/plist.h>
+#include <linux/memblock.h>
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in futex_hash()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+ struct futex_hash_bucket *queues;
+ unsigned long hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
+
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = false,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#endif /* CONFIG_FAIL_FUTEX */
+
+/**
+ * futex_hash - Return the hash bucket in the global hash
+ * @key: Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
+ */
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
+
+ return &futex_queues[hash & (futex_hashsize - 1)];
+}
+
+
+/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that futex_match() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr: virtual address of the futex
+ * @flags: FLAGS_*
+ * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
+ * FUTEX_WRITE)
+ *
+ * Return: a negative error code or 0
+ *
+ * The key words are stored in @key on success.
+ *
+ * For shared mappings (when @fshared), the key is:
+ *
+ * ( inode->i_sequence, page->index, offset_within_page )
+ *
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ *
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+ enum futex_access rw)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ struct folio *folio;
+ struct address_space *mapping;
+ int err, ro = 0;
+ bool fshared;
+
+ fshared = flags & FLAGS_SHARED;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % sizeof(u32)) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ /*
+ * On no-MMU, shared futexes are treated as private, therefore
+ * we must not include the current process in the key. Since
+ * there is only one address space, the address is a unique key
+ * on its own.
+ */
+ if (IS_ENABLED(CONFIG_MMU))
+ key->private.mm = mm;
+ else
+ key->private.mm = NULL;
+
+ key->private.address = address;
+ return 0;
+ }
+
+again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == FUTEX_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
+ if (err < 0)
+ return err;
+ else
+ err = 0;
+
+ /*
+ * The treatment of mapping from this point on is critical. The folio
+ * lock protects many things but in this context the folio lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the folio lock is not needed in all cases being
+ * considered here and folio lock forces unnecessarily serialization.
+ * From this point on, mapping will be re-verified if necessary and
+ * folio lock will be acquired only if it is unavoidable
+ *
+ * Mapping checks require the folio so it is looked up now. For
+ * anonymous pages, it does not matter if the folio is split
+ * in the future as the key is based on the address. For
+ * filesystem-backed pages, the precise page is required as the
+ * index of the page determines the key.
+ */
+ folio = page_folio(page);
+ mapping = READ_ONCE(folio->mapping);
+
+ /*
+ * If folio->mapping is NULL, then it cannot be an anonymous
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the folio lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_inode_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for folio->mapping.
+ */
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Folio lock is required to identify which special case above
+ * applies. If this is really a shmem page then the folio lock
+ * will prevent unexpected transitions.
+ */
+ folio_lock(folio);
+ shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
+
+ if (shmem_swizzled)
+ goto again;
+
+ return -EFAULT;
+ }
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * If the futex key is stored in anonymous memory, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (folio_test_anon(folio)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (unlikely(should_fail_futex(true)) || ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+
+ } else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the folio->mapping must be traversed. Ordinarily this should
+ * be stabilised under folio lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update i_pages or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(folio->mapping) != mapping) {
+ rcu_read_unlock();
+ folio_put(folio);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ folio_put(folio);
+
+ goto again;
+ }
+
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.i_seq = get_inode_sequence_number(inode);
+ key->shared.pgoff = folio->index + folio_page_idx(folio, page);
+ rcu_read_unlock();
+ }
+
+out:
+ folio_put(folio);
+ return err;
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+int fault_in_user_writeable(u32 __user *uaddr)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_read_lock(mm);
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE, NULL);
+ mmap_read_unlock(mm);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (futex_match(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
+int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __get_user(*dest, from);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
+/**
+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+void __futex_unqueue(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
+ return;
+ lockdep_assert_held(q->lock_ptr);
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+}
+
+/* The key must be already stored in q->key. */
+struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+ __acquires(&hb->lock)
+{
+ struct futex_hash_bucket *hb;
+
+ hb = futex_hash(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all futex_q_lock()
+ * users end up calling futex_queue(). Similarly, for housekeeping,
+ * decrement the counter at futex_q_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
+
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock);
+ return hb;
+}
+
+void futex_q_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ spin_unlock(&hb->lock);
+ futex_hb_waiters_dec(hb);
+}
+
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
+}
+
+/**
+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
+ * be paired with exactly one earlier call to futex_queue().
+ *
+ * Return:
+ * - 1 - if the futex_q was still queued (and we removed unqueued it);
+ * - 0 - if the futex_q was already removed by the waking thread
+ */
+int futex_unqueue(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+retry:
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ __futex_unqueue(q);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ */
+void futex_unqueue_pi(struct futex_q *q)
+{
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
+
+ BUG_ON(!q->pi_state);
+ put_pi_state(q->pi_state);
+ q->pi_state = NULL;
+}
+
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
+{
+ u32 uval, nval, mval;
+ pid_t owner;
+ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) The owner part of user space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
+ */
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
+ if (owner != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+
+ case -EAGAIN:
+ cond_resched();
+ goto retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
+ }
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
+
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ unsigned int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ unsigned long futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi, HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending) {
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+#endif
+
+#ifdef CONFIG_FUTEX_PI
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+static void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key = FUTEX_KEY_INIT;
+
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ raw_spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = futex_hash(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+}
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in make_task_dead().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0, 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
+ plist_head_init(&futex_queues[i].chain);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ return 0;
+}
+core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
new file mode 100644
index 000000000000..8b195d06f4e8
--- /dev/null
+++ b/kernel/futex/futex.h
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <linux/futex.h>
+#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
+#include <linux/compat.h>
+
+#ifdef CONFIG_PREEMPT_RT
+#include <linux/rcuwait.h>
+#endif
+
+#include <asm/futex.h>
+
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SIZE_8 0x0000
+#define FLAGS_SIZE_16 0x0001
+#define FLAGS_SIZE_32 0x0002
+#define FLAGS_SIZE_64 0x0003
+
+#define FLAGS_SIZE_MASK 0x0003
+
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x0010
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x0000
+#endif
+#define FLAGS_CLOCKRT 0x0020
+#define FLAGS_HAS_TIMEOUT 0x0040
+#define FLAGS_NUMA 0x0080
+#define FLAGS_STRICT 0x0100
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+ unsigned int flags = FLAGS_SIZE_32;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ flags |= FLAGS_CLOCKRT;
+
+ return flags;
+}
+
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+ unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+ if (!(flags2 & FUTEX2_PRIVATE))
+ flags |= FLAGS_SHARED;
+
+ if (flags2 & FUTEX2_NUMA)
+ flags |= FLAGS_NUMA;
+
+ return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+ return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+ /* Only 64bit futexes for 64bit code */
+ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+ if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+ return false;
+ }
+
+ /* Only 32bit futexes are implemented -- for now */
+ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+ return false;
+
+ return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+ int bits = 8 * futex_size(flags);
+
+ if (bits < 64 && (val >> bits))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_FAIL_FUTEX
+extern bool should_fail_futex(bool fshared);
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+ atomic_t waiters;
+ spinlock_t lock;
+ struct plist_head chain;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex_base pi_mutex;
+
+ struct task_struct *owner;
+ refcount_t refcount;
+
+ union futex_key key;
+} __randomize_layout;
+
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
+ * @task: the task waiting on the futex
+ * @lock_ptr: the hash bucket lock
+ * @wake: the wake handler for this queue
+ * @wake_data: data associated with the wake handler
+ * @key: the key the futex is hashed on
+ * @pi_state: optional priority inheritance state
+ * @rt_waiter: rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key: the requeue_pi target futex key
+ * @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See futex_unqueue_pi().
+ */
+struct futex_q {
+ struct plist_node list;
+
+ struct task_struct *task;
+ spinlock_t *lock_ptr;
+ futex_wake_fn *wake;
+ void *wake_data;
+ union futex_key key;
+ struct futex_pi_state *pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+ union futex_key *requeue_pi_key;
+ u32 bitset;
+ atomic_t requeue_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
+} __randomize_layout;
+
+extern const struct futex_q futex_q_init;
+
+enum futex_access {
+ FUTEX_READ,
+ FUTEX_WRITE
+};
+
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+ enum futex_access rw);
+
+extern struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns);
+
+extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+
+/**
+ * futex_match - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int futex_match(union futex_key *key1, union futex_key *key2)
+{
+ return (key1 && key2
+ && key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb);
+extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
+
+extern int fault_in_user_writeable(u32 __user *uaddr);
+extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
+extern int futex_get_value_locked(u32 *dest, u32 __user *from);
+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+
+extern void __futex_unqueue(struct futex_q *q);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern int futex_unqueue(struct futex_q *q);
+
+/**
+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The
+ * exceptions involve the PI related operations, which may use futex_unqueue_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __futex_queue(q, hb);
+ spin_unlock(&hb->lock);
+}
+
+extern void futex_unqueue_pi(struct futex_q *q);
+
+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Full barrier (B), see the ordering comment above.
+ */
+ smp_mb();
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
+
+extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_unlock(struct futex_hash_bucket *hb);
+
+
+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters);
+
+extern int refill_pi_state_cache(void);
+extern void get_pi_state(struct futex_pi_state *pi_state);
+extern void put_pi_state(struct futex_pi_state *pi_state);
+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 > hb2)
+ swap(hb1, hb2);
+
+ spin_lock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
+/* syscalls */
+
+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
+ val, ktime_t *abs_time, u32 bitset, u32 __user
+ *uaddr2);
+
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi);
+
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset);
+
+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset);
+
+/**
+ * struct futex_vector - Auxiliary struct for futex_waitv()
+ * @w: Userspace provided data
+ * @q: Kernel side data
+ *
+ * Struct used to build an array with all data need for futex_waitv()
+ */
+struct futex_vector {
+ struct futex_waitv w;
+ struct futex_q q;
+};
+
+extern int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data);
+
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+ int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+
+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
+
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+
+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+
+#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
new file mode 100644
index 000000000000..5722467f2737
--- /dev/null
+++ b/kernel/futex/pi.c
@@ -0,0 +1,1269 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/slab.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * PI code:
+ */
+int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state *alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
+void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
+}
+
+/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ */
+void put_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!pi_state)
+ return;
+
+ if (!refcount_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
+ pi_state_update_owner(pi_state, NULL);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
+ }
+
+ if (current->pi_state_cache) {
+ kfree(pi_state);
+ } else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!refcount_read(&pi_state->refcount));
+
+ /*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+ goto out_attach;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ goto out_einval;
+ }
+
+ /*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct task_struct *p;
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
+ */
+ if (!pid)
+ return -EAGAIN;
+ p = find_get_task_by_vpid(pid);
+ if (!p)
+ return handle_exit_race(uaddr, uval, NULL);
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
+ /*
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ /*
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
+ */
+ int ret = handle_exit_race(uaddr, uval, p);
+
+ raw_spin_unlock_irq(&p->pi_lock);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
+ return ret;
+ }
+
+ __attach_to_pi_owner(p, key, ps);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int err;
+ u32 curval;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+ * - 0 - ready to wait;
+ * - 1 - acquired the lock;
+ * - <0 - error
+ *
+ * The hb->lock must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ */
+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
+{
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *top_waiter;
+ int ret;
+
+ /*
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
+ */
+ if (futex_get_value_locked(&uval, uaddr))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
+ return -EDEADLK;
+
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
+ /*
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
+ */
+ if (!(uval & FUTEX_TID_MASK)) {
+ /*
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
+ */
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
+
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
+ }
+
+ /*
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
+ */
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+ /*
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+}
+
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct rt_mutex_waiter *top_waiter)
+{
+ struct task_struct *new_owner;
+ bool postunlock = false;
+ DEFINE_RT_WAKE_Q(wqh);
+ u32 curval, newval;
+ int ret = 0;
+
+ new_owner = top_waiter->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ if (unlikely(should_fail_futex(true))) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
+
+ if (!ret) {
+ /*
+ * This is a point of no return; once we modified the uval
+ * there is no going back and subsequent operations must
+ * not fail.
+ */
+ pi_state_update_owner(pi_state, new_owner);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
+ }
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+
+ return ret;
+}
+
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner, *newowner;
+ u32 uval, curval, newval, newtid;
+ int err = 0;
+
+ oldowner = pi_state->owner;
+
+ /*
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID checks when attaching to PI state .
+ */
+retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 0;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock. pi_state is correct. Tell caller. */
+ return 1;
+ }
+
+ /*
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 1;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ err = futex_get_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
+
+ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ pi_state_update_owner(pi_state, newowner);
+
+ return argowner == current;
+
+ /*
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+handle_err:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ switch (err) {
+ case -EFAULT:
+ err = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ err = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return argowner == current;
+
+ /* Retry if err was -EAGAIN or the fault in succeeded */
+ if (!err)
+ goto retry;
+
+ /*
+ * fault_in_user_writeable() failed so user state is immutable. At
+ * best we can make the kernel state consistent but user state will
+ * be most likely hosed and any subsequent unlock operation will be
+ * rejected due to PI futex rule [10].
+ *
+ * Ensure that the rtmutex owner is also the pi_state owner despite
+ * the user space value claiming something different. There is no
+ * point in unlocking the rtmutex if current is the owner as it
+ * would need to wait until the next waiter has taken the rtmutex
+ * to guarantee consistent state. Keep it simple. Userspace asked
+ * for this wreckaged state.
+ *
+ * The rtmutex has an owner - either current or some other
+ * task. See the EAGAIN loop above.
+ */
+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+ return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ int ret;
+
+ lockdep_assert_held(q->lock_ptr);
+
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * fixup_pi_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Return:
+ * - 1 - success, lock taken;
+ * - 0 - success, lock not taken;
+ * - <0 - on error (-EFAULT)
+ */
+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+{
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ *
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
+ */
+ if (q->pi_state->owner != current)
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
+ }
+
+ /*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
+
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex. Warn and establish consistent state.
+ */
+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+ return fixup_pi_state_owner(uaddr, q, current);
+
+ return 0;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
+ */
+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ to = futex_setup_timer(time, &timeout, flags, 0);
+
+retry:
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+retry_private:
+ hb = futex_q_lock(&q);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
+ }
+
+ WARN_ON(!q.pi_state);
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb);
+
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
+
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
+
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
+cleanup:
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * must unwind the above, however we canont lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the
+ */
+ spin_lock(q.lock_ptr);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
+no_block:
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ goto out;
+
+out_unlock_put_key:
+ futex_q_unlock(hb);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+{
+ u32 curval, uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb;
+ struct futex_q *top_waiter;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != vpid)
+ return -EPERM;
+
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
+ if (ret)
+ return ret;
+
+ hb = futex_hash(&key);
+ spin_lock(&hb->lock);
+retry_hb:
+
+ /*
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and thereby
+ * wake_futex_pi() must observe any new waiters.
+ *
+ * Since the cleanup: case in futex_lock_pi() removes the
+ * rt_waiter without holding hb->lock, it is possible for
+ * wake_futex_pi() to not find a waiter while the above does,
+ * in this case the waiter is on the way out and it can be
+ * ignored.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+ * waiters even though futex thinks there are, then the waiter
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
+ */
+ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (!rt_waiter) {
+ __futex_unqueue(top_waiter);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ goto retry_hb;
+ }
+
+ get_pi_state(pi_state);
+ spin_unlock(&hb->lock);
+
+ /* drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ return ret;
+ /*
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ /*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN)
+ goto pi_retry;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+ return ret;
+ }
+
+ /*
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
+ spin_unlock(&hb->lock);
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return ret;
+ }
+ }
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+ return ret;
+
+pi_retry:
+ cond_resched();
+ goto retry;
+
+pi_faulted:
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
new file mode 100644
index 000000000000..b47bb764b352
--- /dev/null
+++ b/kernel/futex/requeue.c
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/plist.h>
+#include <linux/sched/signal.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on hash bucket lock, but aside of having the benefit of common code,
+ * this allows to avoid doing the requeue when the task is already on the
+ * way out and taking the hash bucket lock of the original uaddr1 when the
+ * requeue has been completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress it has
+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
+const struct futex_q futex_q_init = {
+ /* list gets initialized in futex_queue()*/
+ .wake = futex_wake_mark,
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY,
+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
+};
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
+ futex_hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+ }
+ q->key = *key2;
+}
+
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int old, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and a interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (old != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return;
+
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
+ old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ /* Is requeue done already? */
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q: the futex_q
+ * @key: the key of the requeue target futex
+ * @hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ q->key = *key;
+
+ __futex_unqueue(q);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
+ * Return:
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
+ */
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
+{
+ struct futex_q *top_waiter;
+ u32 curval;
+ int ret;
+
+ if (futex_get_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unnecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!futex_match(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
+ /*
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ exiting, set_waiters);
+ if (ret == 1) {
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
+ }
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1: source futex user address
+ * @flags1: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @flags2: futex flags (FLAGS_SHARED, etc.)
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Return:
+ * - >=0 - on success, the number of tasks requeued or woken;
+ * - <0 - on error
+ */
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
+ if (requeue_pi) {
+ /*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI, waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex, because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to be requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one, and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ }
+
+retry:
+ ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags2, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && futex_match(&key1, &key2))
+ return -EINVAL;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = futex_get_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
+
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
+
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
+ */
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!futex_match(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
+ */
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
+ }
+
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ futex_hb_waiters_dec(hb2);
+ return ret ? ret : task_count;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Determine the cause for the early wakeup.
+ *
+ * Return:
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
+
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initially wait on (non-pi)
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
+ *
+ * We call schedule in futex_wait_queue() when we enqueue and return there
+ * via the following--
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Return:
+ * - 0 - On success;
+ * - <0 - On error
+ */
+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
+ struct rt_mutex_base *pi_mutex;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
+ if (!bitset)
+ return -EINVAL;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (futex_match(&q.key, &key2)) {
+ futex_q_unlock(hb);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue(hb, &q, to);
+
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ break;
+
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_owner(uaddr2, &q, true);
+ /*
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
+ spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
+ }
+ break;
+
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ /*
+ * See futex_unlock_pi()'s cleanup: comment.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr2, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it
+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
+ }
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
new file mode 100644
index 000000000000..4b6da9116aa6
--- /dev/null
+++ b/kernel/futex/syscalls.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/syscalls.h>
+#include <linux/time_namespace.h>
+
+#include "futex.h"
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ unsigned int flags = futex_to_flags(op);
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (flags & FLAGS_CLOCKRT) {
+ if (cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI &&
+ cmd != FUTEX_LOCK_PI2)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAIT_BITSET:
+ return futex_wait(uaddr, flags, val, timeout, val3);
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAKE_BITSET:
+ return futex_wake(uaddr, flags, val, val3);
+ case FUTEX_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
+ case FUTEX_CMP_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
+ case FUTEX_WAKE_OP:
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+ case FUTEX_LOCK_PI:
+ flags |= FLAGS_CLOCKRT;
+ fallthrough;
+ case FUTEX_LOCK_PI2:
+ return futex_lock_pi(uaddr, flags, timeout, 0);
+ case FUTEX_UNLOCK_PI:
+ return futex_unlock_pi(uaddr, flags);
+ case FUTEX_TRYLOCK_PI:
+ return futex_lock_pi(uaddr, flags, NULL, 1);
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
+ }
+ return -ENOSYS;
+}
+
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
+ if (get_timespec64(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+
+/**
+ * futex_parse_waitv - Parse a waitv array from userspace
+ * @futexv: Kernel side list of waiters to be filled
+ * @uwaitv: Userspace list to be parsed
+ * @nr_futexes: Length of futexv
+ * @wake: Wake to call when futex is woken
+ * @wake_data: Data for the wake handler
+ *
+ * Return: Error code on failure, 0 on success
+ */
+int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data)
+{
+ struct futex_waitv aux;
+ unsigned int i;
+
+ for (i = 0; i < nr_futexes; i++) {
+ unsigned int flags;
+
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
+ return -EFAULT;
+
+ if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ flags = futex2_to_flags(aux.flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, aux.val))
+ return -EINVAL;
+
+ futexv[i].w.flags = flags;
+ futexv[i].w.val = aux.val;
+ futexv[i].w.uaddr = aux.uaddr;
+ futexv[i].q = futex_q_init;
+ futexv[i].q.wake = wake;
+ futexv[i].q.wake_data = wake_data;
+ }
+
+ return 0;
+}
+
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+ clockid_t clockid, struct hrtimer_sleeper *to)
+{
+ int flag_clkid = 0, flag_init = 0;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ if (!timeout)
+ return 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, to, flag_clkid, 0);
+ return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+}
+
+/**
+ * sys_futex_waitv - Wait on a list of futexes
+ * @waiters: List of futexes to wait on
+ * @nr_futexes: Length of futexv
+ * @flags: Flag for timeout (monotonic/realtime)
+ * @timeout: Optional absolute timeout.
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
+ *
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
+ * the operation. Each waiter has individual flags. The `flags` argument for
+ * the syscall should be used solely for specifying the timeout as realtime, if
+ * needed. Flags for private futexes, sizes, etc. should be used on the
+ * individual flags of each waiter.
+ *
+ * Returns the array index of one of the woken futexes. No further information
+ * is provided: any number of other futexes may also have been woken by the
+ * same event, and if more than one futex was woken, the retrned index may
+ * refer to any one of them. (It is not necessaryily the futex with the
+ * smallest index, nor the one most recently woken, nor...)
+ */
+
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
+ unsigned int, nr_futexes, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ struct futex_vector *futexv;
+ int ret;
+
+ /* This syscall supports no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv) {
+ ret = -ENOMEM;
+ goto destroy_timer;
+ }
+
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+ NULL);
+ if (!ret)
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+
+ kfree(futexv);
+
+destroy_timer:
+ if (timeout)
+ futex2_destroy_timeout(&to);
+ return ret;
+}
+
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr: Address of the futex(es) to wake
+ * @mask: bitmask
+ * @nr: Number of the futexes to wake
+ * @flags: FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+ void __user *, uaddr,
+ unsigned long, mask,
+ int, nr,
+ unsigned int, flags)
+{
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr: Address of the futex to wait on
+ * @val: Value of @uaddr
+ * @mask: bitmask
+ * @flags: FUTEX2 flags
+ * @timeout: Optional absolute timeout
+ * @clockid: Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+ void __user *, uaddr,
+ unsigned long, val,
+ unsigned long, mask,
+ unsigned int, flags,
+ struct __kernel_timespec __user *, timeout,
+ clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ int ret;
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, val) ||
+ !futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+ if (timeout)
+ futex2_destroy_timeout(&to);
+
+ return ret;
+}
+
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters: array describing the source and destination futex
+ * @flags: unused
+ * @nr_wake: number of futexes to wake
+ * @nr_requeue: number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+ struct futex_waitv __user *, waiters,
+ unsigned int, flags,
+ int, nr_wake,
+ int, nr_requeue)
+{
+ struct futex_vector futexes[2];
+ u32 cmpval;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!waiters)
+ return -EINVAL;
+
+ ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
+ if (ret)
+ return ret;
+
+ cmpval = futexes[0].w.val;
+
+ return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+ u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+ nr_wake, nr_requeue, &cmpval, 0);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
new file mode 100644
index 000000000000..3a10375d9521
--- /dev/null
+++ b/kernel/futex/waitwake.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/plist.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/freezer.h>
+
+#include "futex.h"
+
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read (see futex_hb_waiters_pending()).
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in futex_q_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+bool __futex_wake_mark(struct futex_q *q)
+{
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return false;
+
+ __futex_unqueue(q);
+ /*
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __futex_unqueue().
+ */
+ smp_store_release(&q->lock_ptr, NULL);
+
+ return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ get_task_struct(p);
+
+ if (!__futex_wake_mark(q)) {
+ put_task_struct(p);
+ return;
+ }
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock.
+ */
+ wake_q_add_safe(wake_q, p);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ union futex_key key = FUTEX_KEY_INIT;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ if ((flags & FLAGS_STRICT) && !nr_wake)
+ return 0;
+
+ hb = futex_hash(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!futex_hb_waiters_pending(hb))
+ return ret;
+
+ spin_lock(&hb->lock);
+
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ if (futex_match (&this->key, &key)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
+ oparg = 1 << oparg;
+ }
+
+ pagefault_disable();
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ pagefault_enable();
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ int ret, op_ret;
+ DEFINE_WAKE_Q(wake_q);
+
+retry:
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ return ret;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+ return ret;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
+ }
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart);
+
+/**
+ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb);
+
+ /* Arm the timer */
+ if (timeout)
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+
+ /*
+ * If we have been removed from the hash list, then another task
+ * has tried to wake us, and we can skip the call to schedule().
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
+ * @v: The list of futexes to unqueue
+ * @count: Number of futexes in the list
+ *
+ * Helper to unqueue a list of futexes. This can't fail.
+ *
+ * Return:
+ * - >=0 - Index of the last futex that was awoken;
+ * - -1 - No futex was awoken
+ */
+int futex_unqueue_multiple(struct futex_vector *v, int count)
+{
+ int ret = -1, i;
+
+ for (i = 0; i < count; i++) {
+ if (!futex_unqueue(&v[i].q))
+ ret = i;
+ }
+
+ return ret;
+}
+
+/**
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
+ * @vs: The futex list to wait on
+ * @count: The size of the list
+ * @woken: Index of the last woken futex, if any. Used to notify the
+ * caller that it can return this index to userspace (return parameter)
+ *
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
+ * the futex list is invalid or if any futex was already awoken. On success the
+ * task is ready to interruptible sleep.
+ *
+ * Return:
+ * - 1 - One of the futexes was woken by another thread
+ * - 0 - Success
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
+ */
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+{
+ struct futex_hash_bucket *hb;
+ bool retry = false;
+ int ret, i;
+ u32 uval;
+
+ /*
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
+ * each futex on the list before dealing with the next one to avoid
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
+ * lose any wake events, which cannot be done before the get_futex_key
+ * of the next key, because it calls get_user_pages, which can sleep.
+ * Thus, we fetch the list of futexes keys in two steps, by first
+ * pinning all the memory keys in the futex key, and only then we read
+ * each key and queue the corresponding futex.
+ *
+ * Private futexes doesn't need to recalculate hash in retry, so skip
+ * get_futex_key() when retrying.
+ */
+retry:
+ for (i = 0; i < count; i++) {
+ if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
+ continue;
+
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
+ vs[i].w.flags,
+ &vs[i].q.key, FUTEX_READ);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+
+ for (i = 0; i < count; i++) {
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
+ struct futex_q *q = &vs[i].q;
+ u32 val = vs[i].w.val;
+
+ hb = futex_q_lock(q);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb);
+ continue;
+ }
+
+ futex_q_unlock(hb);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Even if something went wrong, if we find out that a futex
+ * was woken, we don't return error and return this index to
+ * userspace
+ */
+ *woken = futex_unqueue_multiple(vs, i);
+ if (*woken >= 0)
+ return 1;
+
+ if (ret) {
+ /*
+ * If we need to handle a page fault, we need to do so
+ * without any lock and any enqueued futex (otherwise
+ * we could lose some wakeup). So we do it here, after
+ * undoing all the work done so far. In success, we
+ * retry all the work.
+ */
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+
+ retry = true;
+ goto retry;
+ }
+
+ if (uval != val)
+ return -EWOULDBLOCK;
+ }
+
+ return 0;
+}
+
+/**
+ * futex_sleep_multiple - Check sleeping conditions and sleep
+ * @vs: List of futexes to wait for
+ * @count: Length of vs
+ * @to: Timeout
+ *
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
+ * been woken up.
+ */
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ if (to && !to->task)
+ return;
+
+ for (; count; count--, vs++) {
+ if (!READ_ONCE(vs->q.lock_ptr))
+ return;
+ }
+
+ schedule();
+}
+
+/**
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
+ * @vs: The list of futexes to wait on
+ * @count: The number of objects
+ * @to: Timeout before giving up and returning to userspace
+ *
+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
+ * sleeps on a group of futexes and returns on the first futex that is
+ * wake, or after the timeout has elapsed.
+ *
+ * Return:
+ * - >=0 - Hint to the futex that was awoken
+ * - <0 - On error
+ */
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ int ret, hint = 0;
+
+ if (to)
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ while (1) {
+ ret = futex_wait_multiple_setup(vs, count, &hint);
+ if (ret) {
+ if (ret > 0) {
+ /* A futex was woken during setup */
+ ret = hint;
+ }
+ return ret;
+ }
+
+ futex_sleep_multiple(vs, count, to);
+
+ __set_current_state(TASK_RUNNING);
+
+ ret = futex_unqueue_multiple(vs, count);
+ if (ret >= 0)
+ return ret;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+ else if (signal_pending(current))
+ return -ERESTARTSYS;
+ /*
+ * The final case is a spurious wakeup, for
+ * which just retry.
+ */
+ }
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held on success, and unlocked on failure.
+ *
+ * Return:
+ * - 0 - uaddr contains val and hb has been locked;
+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+retry:
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = futex_q_lock(q);
+
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (ret) {
+ futex_q_unlock(*hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(*hb);
+ ret = -EWOULDBLOCK;
+ }
+
+ return ret;
+}
+
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset)
+{
+ struct futex_q q = futex_q_init;
+ struct futex_hash_bucket *hb;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ q.bitset = bitset;
+
+retry:
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ return ret;
+
+ /* futex_queue and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue(hb, &q, to);
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!futex_unqueue(&q))
+ return 0;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+
+ /*
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
+ */
+ if (!signal_pending(current))
+ goto retry;
+
+ return -ERESTARTSYS;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ int ret;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+ /* No timeout, nothing to clean up. */
+ if (!to)
+ return ret;
+
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+
+ if (ret == -ERESTARTSYS) {
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ return set_restart_fn(restart, futex_wait_restart);
+ }
+
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = restart->futex.uaddr;
+ ktime_t t, *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t = restart->futex.time;
+ tp = &t;
+ }
+ restart->fn = do_no_restart_syscall;
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+}
+
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3110c77230c7..04f4ebdc3cf5 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,8 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS if !UML
+ depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
+ select CONSTRUCTORS
default n
help
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 16f8ecc7d882..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 0ffe9f194080..073a3738c5e6 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -49,6 +49,55 @@ void gcov_enable_events(void)
mutex_unlock(&gcov_lock);
}
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
#ifdef CONFIG_MODULES
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c94b820a1b62..7670a811a565 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -48,9 +48,8 @@
#include <linux/list.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "gcov.h"
typedef void (*llvm_gcov_callback)(void);
@@ -70,12 +69,10 @@ struct gcov_fn_info {
u32 ident;
u32 checksum;
- u8 use_extra_checksum;
u32 cfg_checksum;
u32 num_counters;
u64 *counters;
- const char *function_name;
};
static struct gcov_info *current_info;
@@ -105,17 +102,15 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
}
EXPORT_SYMBOL(llvm_gcov_init);
-void llvm_gcda_start_file(const char *orig_filename, const char version[4],
- u32 checksum)
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
{
current_info->filename = orig_filename;
- memcpy(&current_info->version, version, sizeof(current_info->version));
+ current_info->version = version;
current_info->checksum = checksum;
}
EXPORT_SYMBOL(llvm_gcda_start_file);
-void llvm_gcda_emit_function(u32 ident, const char *function_name,
- u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
{
struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -125,11 +120,7 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
INIT_LIST_HEAD(&info->head);
info->ident = ident;
info->checksum = func_checksum;
- info->use_extra_checksum = use_extra_checksum;
info->cfg_checksum = cfg_checksum;
- if (function_name)
- info->function_name = kstrdup(function_name, GFP_KERNEL);
-
list_add_tail(&info->head, &current_info->functions);
}
EXPORT_SYMBOL(llvm_gcda_emit_function);
@@ -262,10 +253,7 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
!list_is_last(&fn_ptr2->head, &info2->functions)) {
if (fn_ptr1->checksum != fn_ptr2->checksum)
return false;
- if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
- return false;
- if (fn_ptr1->use_extra_checksum &&
- fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
return false;
fn_ptr1 = list_next_entry(fn_ptr1, head);
fn_ptr2 = list_next_entry(fn_ptr2, head);
@@ -292,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
for (i = 0; i < sfn_ptr->num_counters; i++)
dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
}
}
@@ -304,23 +294,16 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
return NULL;
INIT_LIST_HEAD(&fn_dup->head);
- fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
- if (!fn_dup->function_name)
- goto err_name;
-
cv_size = fn->num_counters * sizeof(fn->counters[0]);
- fn_dup->counters = vmalloc(cv_size);
- if (!fn_dup->counters)
- goto err_counters;
+ fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL);
+ if (!fn_dup->counters) {
+ kfree(fn_dup);
+ return NULL;
+ }
+
memcpy(fn_dup->counters, fn->counters, cv_size);
return fn_dup;
-
-err_counters:
- kfree(fn_dup->function_name);
-err_name:
- kfree(fn_dup);
- return NULL;
}
/**
@@ -367,8 +350,7 @@ void gcov_info_free(struct gcov_info *info)
struct gcov_fn_info *fn, *tmp;
list_for_each_entry_safe(fn, tmp, &info->functions, head) {
- kfree(fn->function_name);
- vfree(fn->counters);
+ kvfree(fn->counters);
list_del(&fn->head);
kfree(fn);
}
@@ -376,71 +358,6 @@ void gcov_info_free(struct gcov_info *info)
kfree(info);
}
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
-
/**
* convert_to_gcda - convert profiling data set to gcda file format
* @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -448,7 +365,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
size_t pos = 0;
@@ -460,18 +377,12 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
list_for_each_entry(fi_ptr, &info->functions, head) {
u32 i;
- u32 len = 2;
-
- if (fi_ptr->use_extra_checksum)
- len++;
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, len);
+ pos += store_gcov_u32(buffer, pos, 3);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
- if (fi_ptr->use_extra_checksum)
- pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
for (i = 0; i < fi_ptr->num_counters; i++)
@@ -480,102 +391,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
return pos;
}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 82babf5aa077..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
#include "gcov.h"
/**
@@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str)
}
__setup("gcov_persist=", gcov_persist_setup);
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ size_t size;
+ loff_t pos;
+ char buffer[] __counted_by(size);
+};
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+static struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+ size_t size;
+
+ /* Dry-run to get the actual buffer size. */
+ size = convert_to_gcda(NULL, info);
+
+ iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->info = info;
+ iter->size = size;
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+}
+
+
+/**
+ * gcov_iter_free - free iterator data
+ * @iter: file iterator
+ */
+static void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kvfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+static void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+static int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
+
/*
* seq_file.start() implementation for gcov data files. Note that the
* gcov_iterator interface is designed to be more restrictive than seq_file
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 908fdf5098c3..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -15,22 +15,28 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "gcov.h"
-#if (__GNUC__ >= 7)
+#if (__GNUC__ >= 10)
+#define GCOV_COUNTERS 8
+#elif (__GNUC__ >= 7)
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
-#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
-#define GCOV_COUNTERS 9
#else
-#define GCOV_COUNTERS 8
+#define GCOV_COUNTERS 9
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -76,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -88,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
@@ -227,10 +238,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
/**
* gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
*
- * Adds profiling counts of @source to @dest.
+ * Adds profiling counts of @src to @dst.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
@@ -310,7 +321,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
cv_size = sizeof(gcov_type) * sci_ptr->num;
- dci_ptr->values = vmalloc(cv_size);
+ dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL);
if (!dci_ptr->values)
goto err_free;
@@ -352,7 +363,7 @@ void gcov_info_free(struct gcov_info *info)
ci_ptr = info->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
- vfree(ci_ptr->values);
+ kvfree(ci_ptr->values);
kfree(info->functions[fi_idx]);
}
@@ -363,71 +374,6 @@ free_info:
kfree(info);
}
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
-
/**
* convert_to_gcda - convert profiling data set to gcda file format
* @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -435,7 +381,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
struct gcov_ctr_info *ci_ptr;
@@ -449,12 +395,18 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -468,7 +420,8 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
@@ -481,102 +434,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
return pos;
}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 6ab2c1808c9d..912b8ea01d33 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
+size_t convert_to_gcda(char *buffer, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
@@ -58,16 +59,9 @@ enum gcov_action {
void gcov_event(enum gcov_action action, struct gcov_info *info);
void gcov_enable_events(void);
-/* Iterator control. */
-struct seq_file;
-struct gcov_iterator;
-
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
-void gcov_iter_free(struct gcov_iterator *iter);
-void gcov_iter_start(struct gcov_iterator *iter);
-int gcov_iter_next(struct gcov_iterator *iter);
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+/* writing helpers */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v);
+size_t store_gcov_u64(void *buffer, size_t off, u64 v);
/* gcov_info control. */
void gcov_info_reset(struct gcov_info *info);
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index c1510f0ab3ea..6d443ea22bb7 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -7,13 +7,15 @@ set -e
sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/$tarfile.tmp
+cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir
dir_list="
include/
arch/$SRCARCH/include/
"
+type cpio > /dev/null
+
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
@@ -31,18 +33,18 @@ if [ "$building_out_of_srctree" ]; then
fi
all_dirs="$all_dirs $dir_list"
-# include/generated/compile.h is ignored because it is touched even when none
-# of the source files changed.
+# include/generated/utsversion.h is ignored because it is generated after this
+# script is executed. (utsversion.h is unneeded for kheaders)
#
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
# updated, but the contents might be still the same. When any CONFIG option is
-# changed, Kconfig touches the corresponding timestamp file include/config/*.h.
+# changed, Kconfig touches the corresponding timestamp file include/config/*.
# Hence, the md5sum detects the configuration change anyway. We do not need to
# check include/generated/autoconf.h explicitly.
#
# Ignore them for md5 calculation to avoid pointless regeneration.
headers_md5="$(find $all_dirs -name "*.h" |
- grep -v "include/generated/compile.h" |
+ grep -v "include/generated/utsversion.h" |
grep -v "include/generated/autoconf.h" |
xargs ls -l | md5sum | cut -d ' ' -f1)"
@@ -56,9 +58,7 @@ if [ -f kernel/kheaders.md5 ] &&
exit
fi
-if [ "${quiet}" != "silent_" ]; then
- echo " GEN $tarfile"
-fi
+echo " GEN $tarfile"
rm -rf $cpio_dir
mkdir $cpio_dir
@@ -76,19 +76,16 @@ fi
# of tree builds having stale headers in srctree. Just silence CPIO for now.
for f in $dir_list;
do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1
# Remove comments except SDPX lines
find $cpio_dir -type f -print0 |
xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
# Create archive and try to normalize metadata for reproducibility.
-# For compatibility with older versions of tar, files are fed to tar
-# pre-sorted, as --sort=name might not be available.
-find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \
- tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
- --owner=0 --group=0 --numeric-owner --no-recursion \
- -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null
+tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+ --owner=0 --group=0 --sort=name --numeric-owner \
+ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
echo $headers_md5 > kernel/kheaders.md5
echo "$this_file_md5" >> kernel/kheaders.md5
diff --git a/kernel/groups.c b/kernel/groups.c
index 6ee6691f6839..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -15,16 +15,11 @@
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *gi;
- unsigned int len;
-
- len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
- gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
- if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT);
+ gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
if (!gi)
return NULL;
- atomic_set(&gi->usage, 1);
+ refcount_set(&gi->usage, 1);
gi->ngroups = gidsetsize;
return gi;
}
@@ -139,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
+ const struct cred *old;
+ int retval;
new = prepare_creds();
if (!new)
return -ENOMEM;
+ old = current_cred();
+
set_groups(new, group_info);
+
+ retval = security_task_fix_setgroups(new, old);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
EXPORT_SYMBOL(set_current_groups);
@@ -178,7 +186,7 @@ bool may_setgroups(void)
{
struct user_namespace *user_ns = current_user_ns();
- return ns_capable(user_ns, CAP_SETGID) &&
+ return ns_capable_setid(user_ns, CAP_SETGID) &&
userns_may_setgroups(user_ns);
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ce76f490126c..9a24574988d2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
+#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
@@ -27,7 +28,7 @@
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
@@ -46,9 +47,9 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
*/
-unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
-int __read_mostly sysctl_hung_task_warnings = 10;
+static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
@@ -62,15 +63,17 @@ static struct task_struct *watchdog_task;
* Should we dump all CPUs backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#else
+#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
-unsigned int __read_mostly sysctl_hung_task_panic =
- CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+static unsigned int __read_mostly sysctl_hung_task_panic =
+ IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -92,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ensure the task is not frozen.
* Also, skip vfork and any other user process that freezer should skip.
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
+ return;
/*
* When a freshly created task is scheduled once, changes its state to
@@ -139,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
@@ -188,6 +193,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
+ unsigned int state;
+
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -195,8 +202,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ */
+ state = READ_ONCE(t->__state);
+ if ((state & TASK_UNINTERRUPTIBLE) &&
+ !(state & TASK_WAKEKILL) &&
+ !(state & TASK_NOLOAD))
check_hung_task(t, timeout);
}
unlock:
@@ -221,11 +234,12 @@ static long hung_timeout_jiffies(unsigned long last_checked,
MAX_SCHEDULE_TIMEOUT;
}
+#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void __user *buffer,
+static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -241,6 +255,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
+static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
+static struct ctl_table hung_task_sysctls[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ },
+ {}
+};
+
+static void __init hung_task_sysctl_init(void)
+{
+ register_sysctl_init("kernel", hung_task_sysctls);
+}
+#else
+#define hung_task_sysctl_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -310,6 +394,7 @@ static int __init hung_task_init(void)
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+ hung_task_sysctl_init();
return 0;
}
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 62c92e43aa0d..dc2120776e1c 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -3,19 +3,16 @@
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
-
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
+#include <linux/ioremap.h>
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
+#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
}
#endif
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20512252ecc9..2531f3496ab6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -24,11 +24,7 @@ config GENERIC_IRQ_SHOW_LEVEL
# Supports effective affinity mask
config GENERIC_IRQ_EFFECTIVE_AFF_MASK
- bool
-
-# Facility to allocate a hardware interrupt. This is legacy support
-# and should not be used in new code. Use irq domains instead.
-config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ depends on SMP
bool
# Support for delayed migration from interrupt context
@@ -51,10 +47,6 @@ config GENERIC_IRQ_INJECTION
config HARDIRQS_SW_RESEND
bool
-# Preflow handler support for fasteoi (sparc64)
-config IRQ_PREFLOW_FASTEOI
- bool
-
# Edge style eoi based handler (cell)
config IRQ_EDGE_EOI_HANDLER
bool
@@ -79,6 +71,11 @@ config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Support for obsolete non-mapping irq domains
+config IRQ_DOMAIN_NOMAP
+ bool
+ select IRQ_DOMAIN
+
# Support for hierarchical fasteoi+edge and fasteoi+level handlers
config IRQ_FASTEOI_HIERARCHY_HANDLERS
bool
@@ -86,23 +83,22 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
+ depends on SMP
+ select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
bool
+ depends on SMP
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
-config HANDLE_DOMAIN_IRQ
- bool
-
config IRQ_TIMINGS
bool
@@ -147,3 +143,10 @@ config GENERIC_IRQ_MULTI_HANDLER
bool
help
Allow to specify the low level IRQ handler at run time.
+
+# Cavium Octeon is the last system to use this deprecated option
+# Do not even think of enabling this on any new platform
+config DEPRECATED_IRQ_CPU_ONOFFLINE
+ bool
+ depends on CAVIUM_OCTEON_SOC
+ default CAVIUM_OCTEON_SOC
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..44a4eba80315 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,397 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (!cpumask_weight(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- get_online_cpus();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- put_online_cpus();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -460,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = group_cpus_evenly(this_vecs);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
@@ -505,9 +119,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
if (affd->calc_sets) {
set_vecs = maxvec - resv;
} else {
- get_online_cpus();
+ cpus_read_lock();
set_vecs = cpumask_weight(cpu_possible_mask);
- put_online_cpus();
+ cpus_read_unlock();
}
return resv + min(set_vecs, maxvec - resv);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 41e7e37a0928..dc94e0bf2c94 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
if (!desc)
return -EINVAL;
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
+ desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -61,7 +58,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
EXPORT_SYMBOL(irq_set_chip);
/**
- * irq_set_type - set the irq trigger type for an irq
+ * irq_set_irq_type - set the irq trigger type for an irq
* @irq: irq number
* @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
@@ -191,7 +188,8 @@ enum {
#ifdef CONFIG_SMP
static int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -227,7 +225,8 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
}
#else
static __always_inline int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
return IRQ_STARTUP_NORMAL;
}
@@ -255,7 +254,7 @@ static int __irq_startup(struct irq_desc *desc)
int irq_startup(struct irq_desc *desc, bool resend, bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
- struct cpumask *aff = irq_data_get_affinity_mask(d);
+ const struct cpumask *aff = irq_data_get_affinity_mask(d);
int ret = 0;
desc->depth = 0;
@@ -265,8 +264,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
@@ -304,6 +306,7 @@ static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
+ clear_irq_resend(desc);
desc->depth = 1;
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
@@ -470,25 +473,22 @@ void handle_nested_irq(unsigned int irq)
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ raw_spin_unlock_irq(&desc->lock);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ atomic_inc(&desc->threads_active);
raw_spin_unlock_irq(&desc->lock);
action_ret = IRQ_NONE;
for_each_action_of_desc(desc, action)
action_ret |= action->thread_fn(action->irq, action->dev_id);
- if (!noirqdebug)
+ if (!irq_settings_no_debug(desc))
note_interrupt(desc, action_ret);
- raw_spin_lock_irq(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock_irq(&desc->lock);
+ wake_threads_waitq(desc);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
@@ -572,8 +572,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- unsigned int flags = 0;
-
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -590,7 +588,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- __handle_irq_event_percpu(desc, &flags);
+ __handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -656,16 +654,6 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_level_irq);
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void preflow_handler(struct irq_desc *desc)
-{
- if (desc->preflow_handler)
- desc->preflow_handler(&desc->irq_data);
-}
-#else
-static inline void preflow_handler(struct irq_desc *desc) { }
-#endif
-
static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
{
if (!(desc->istate & IRQS_ONESHOT)) {
@@ -702,8 +690,16 @@ void handle_fasteoi_irq(struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (!irq_may_run(desc))
+ /*
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
+ */
+ if (!irq_may_run(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
goto out;
+ }
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
@@ -721,11 +717,16 @@ void handle_fasteoi_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
+
raw_spin_unlock(&desc->lock);
return;
out:
@@ -772,7 +773,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
- * Interrupt occures on the falling and/or rising edge of a hardware
+ * Interrupt occurs on the falling and/or rising edge of a hardware
* signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
@@ -819,7 +820,7 @@ void handle_edge_irq(struct irq_desc *desc)
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
- * Renable it, if it was not disabled in meantime.
+ * Reenable it, if it was not disabled in meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
@@ -1019,8 +1020,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
- if (is_chained)
+ if (is_chained) {
desc->action = NULL;
+ WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ }
desc->depth = 1;
}
desc->handle_irq = handle;
@@ -1046,6 +1049,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
+ WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
irq_activate_and_startup(desc, IRQ_RESEND);
}
}
@@ -1083,7 +1087,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1130,6 +1134,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
}
EXPORT_SYMBOL_GPL(irq_modify_status);
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
@@ -1189,6 +1194,7 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+#endif
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -1231,7 +1237,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc)
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1281,7 +1286,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1432,7 +1436,7 @@ EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
* @dest: The affinity mask to set
* @force: Flag to enforce setting (disable online checks)
*
- * Conditinal, as the underlying parent chip might not implement it.
+ * Conditional, as the underlying parent chip might not implement it.
*/
int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest, bool force)
@@ -1478,6 +1482,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy);
/**
* irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
@@ -1492,7 +1497,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
return -ENOSYS;
}
-
+EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent);
/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
@@ -1525,7 +1530,8 @@ int irq_chip_request_resources_parent(struct irq_data *data)
if (data->chip->irq_request_resources)
return data->chip->irq_request_resources(data);
- return -ENOSYS;
+ /* no error on missing optional irq_chip::irq_request_resources */
+ return 0;
}
EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
@@ -1543,7 +1549,7 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
- * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * irq_chip_compose_msi_msg - Compose msi message for a irq chip
* @data: Pointer to interrupt specific data
* @msg: Pointer to the MSI message
*
@@ -1553,21 +1559,28 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
*/
int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_data *pos = NULL;
+ struct irq_data *pos;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- for (; data; data = data->parent_data)
-#endif
+ for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) {
if (data->chip && data->chip->irq_compose_msi_msg)
pos = data;
+ }
+
if (!pos)
return -ENOSYS;
pos->chip->irq_compose_msi_msg(pos, msg);
-
return 0;
}
+static struct device *irq_get_pm_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->pm_dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1577,17 +1590,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- int retval;
+ struct device *dev = irq_get_pm_device(data);
+ int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
- if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
- return retval;
- }
- }
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_resume_and_get(dev);
- return 0;
+ return retval;
}
/**
@@ -1600,10 +1609,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 02236b13b359..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -166,7 +166,7 @@ void irq_migrate_all_off_this_cpu(void)
raw_spin_unlock(&desc->lock);
if (affinity_broken) {
- pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
+ pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
irq, smp_processor_id());
}
}
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4f9f844074db..aae0402507ed 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -30,7 +30,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
- struct cpumask *msk;
+ const struct cpumask *msk;
msk = irq_data_get_affinity_mask(data);
seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
@@ -57,6 +57,8 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
+ BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
};
static void
@@ -68,8 +70,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
@@ -112,14 +118,22 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_SET),
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+ BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
- BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+
+ BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
+
+ BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
+
+ BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
};
static const struct irq_bit_descr irqdesc_states[] = {
@@ -131,6 +145,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
BIT_MASK_DESCR(_IRQ_IS_POLLED),
BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+ BIT_MASK_DESCR(_IRQ_HIDDEN),
};
static const struct irq_bit_descr irqdesc_istates[] = {
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 0b0cdf206dc4..7fe6cffe7d0d 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -13,7 +13,7 @@
/*
* What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
+ * Each architecture has to answer this themselves.
*/
static void ack_bad(struct irq_data *data)
{
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e2999a070a99..d39a40bc542b 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
void irq_gc_noop(struct irq_data *d)
{
}
+EXPORT_SYMBOL_GPL(irq_gc_noop);
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
@@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
@@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
@@ -200,6 +203,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
irq_gc_unlock(gc);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
static u32 irq_readl_be(void __iomem *addr)
{
@@ -215,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
+ struct irq_chip_type *ct = gc->chip_types;
+ int i;
+
raw_spin_lock_init(&gc->lock);
gc->num_ct = num_ct;
gc->irq_base = irq_base;
gc->reg_base = reg_base;
- gc->chip_types->chip.name = name;
+ for (i = 0; i < num_ct; i++)
+ ct[i].chip.name = name;
gc->chip_types->handler = handler;
}
@@ -239,9 +247,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = kzalloc(sz, GFP_KERNEL);
+ gc = kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc) {
irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
handler);
@@ -269,7 +276,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
* @d: irq domain for which to allocate chips
* @irqs_per_chip: Number of interrupts each chip handles (max 32)
* @num_ct: Number of irq_chip_type instances associated with this
@@ -287,8 +294,11 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- int numchips, sz, i;
unsigned long flags;
+ int numchips, i;
+ size_t dgc_sz;
+ size_t gc_sz;
+ size_t sz;
void *tmp;
if (d->gc)
@@ -299,8 +309,9 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- sz = sizeof(*dgc) + numchips * sizeof(gc);
- sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+ gc_sz = struct_size(gc, chip_types, num_ct);
+ dgc_sz = struct_size(dgc, gc, numchips);
+ sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
@@ -313,7 +324,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
d->gc = dgc;
/* Calc pointer to the first generic chip */
- tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ tmp += dgc_sz;
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
@@ -330,7 +341,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
/* Calc pointer to the next generic chip */
- tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ tmp += gc_sz;
}
return 0;
}
@@ -424,7 +435,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
return 0;
}
-static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
{
struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -444,7 +455,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
-struct irq_domain_ops irq_generic_chip_ops = {
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
@@ -537,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int i, virq;
raw_spin_lock(&gc_lock);
list_del(&gc->list);
raw_spin_unlock(&gc_lock);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ /*
+ * Interrupt domain based chips store the base hardware
+ * interrupt number in gc::irq_base. Otherwise gc::irq_base
+ * contains the base Linux interrupt number.
+ */
+ if (gc->domain) {
+ virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+ if (!virq)
+ continue;
+ } else {
+ virq = gc->irq_base + i;
+ }
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(virq, NULL);
+ irq_set_chip(virq, &no_irq_chip);
+ irq_set_chip_data(virq, NULL);
+ irq_modify_status(virq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a8e14c80b405..9489f93b3db3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -14,6 +14,8 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <asm/irq_regs.h>
+
#include <trace/events/irq.h>
#include "internals.h"
@@ -134,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@@ -172,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
-
- /* Fall through - to add to randomness */
- case IRQ_HANDLED:
- *flags |= action->flags;
break;
default:
@@ -191,13 +189,12 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
- unsigned int flags = 0;
- retval = __handle_irq_event_percpu(desc, &flags);
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
- if (!noirqdebug)
+ if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
return retval;
}
@@ -226,4 +223,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
handle_arch_irq = handle_irq;
return 0;
}
+
+/**
+ * generic_handle_arch_irq - root irq handler for architectures which do no
+ * entry accounting themselves
+ * @regs: Register file coming from the low-level handling code
+ */
+asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+
+ irq_enter();
+ old_regs = set_irq_regs(regs);
+ handle_arch_irq(regs);
+ set_irq_regs(old_regs);
+ irq_exit();
+}
#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 7db284b10ac9..bcc7f21db9ee 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,9 +12,9 @@
#include <linux/sched/clock.h>
#ifdef CONFIG_SPARSE_IRQ
-# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+# define MAX_SPARSE_IRQS INT_MAX
#else
-# define IRQ_BITMAP_BITS NR_IRQS
+# define MAX_SPARSE_IRQS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
@@ -29,12 +29,14 @@ extern struct irqaction chained_action;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
@@ -45,11 +47,15 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -62,6 +68,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
@@ -101,17 +108,19 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
enum irqchip_irq_state which,
bool *state);
-extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
int check_irq_resend(struct irq_desc *desc, bool inject);
+void clear_irq_resend(struct irq_desc *desc);
+void irq_resend_init(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+void wake_threads_waitq(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -473,6 +482,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data)
}
#endif
+static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irqd->parent_data;
+#else
+ return NULL;
+#endif
+}
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 43e3d1be622c..961d4af76af3 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -14,11 +14,11 @@
/**
* irq_reserve_ipi() - Setup an IPI to destination cpumask
* @domain: IPI domain
- * @dest: cpumask of cpus which can receive the IPI
+ * @dest: cpumask of CPUs which can receive the IPI
*
* Allocate a virq that can be used to send IPI to any CPU in dest mask.
*
- * On success it'll return linux irq number and error code on failure
+ * Return: Linux IRQ number on success or error code on failure
*/
int irq_reserve_ipi(struct irq_domain *domain,
const struct cpumask *dest)
@@ -104,22 +104,22 @@ free_descs:
/**
* irq_destroy_ipi() - unreserve an IPI that was previously allocated
- * @irq: linux irq number to be destroyed
- * @dest: cpumask of cpus which should have the IPI removed
+ * @irq: Linux IRQ number to be destroyed
+ * @dest: cpumask of CPUs which should have the IPI removed
*
- * The IPIs allocated with irq_reserve_ipi() are retuerned to the system
+ * The IPIs allocated with irq_reserve_ipi() are returned to the system
* destroying all virqs associated with them.
*
- * Return 0 on success or error code on failure.
+ * Return: %0 on success or error code on failure.
*/
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
struct irq_domain *domain;
unsigned int nr_irqs;
- if (!irq || !data || !ipimask)
+ if (!irq || !data)
return -EINVAL;
domain = data->domain;
@@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
return -EINVAL;
}
- if (WARN_ON(!cpumask_subset(dest, ipimask)))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
/*
* Must be destroying a subset of CPUs to which this IPI
* was set up to target
@@ -150,24 +151,25 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
}
/**
- * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
- * @irq: linux irq number
- * @cpu: the target cpu
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU
+ * @irq: Linux IRQ number
+ * @cpu: the target CPU
*
* When dealing with coprocessors IPI, we need to inform the coprocessor of
* the hwirq it needs to use to receive and send IPIs.
*
- * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ * Return: hwirq value on success or INVALID_HWIRQ on failure.
*/
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
- if (!data || !ipimask || cpu >= nr_cpu_ids)
+ if (!data || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
- if (!cpumask_test_cpu(cpu, ipimask))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
return INVALID_HWIRQ;
/*
@@ -186,9 +188,9 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
- struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+ const struct cpumask *ipimask;
- if (!chip || !ipimask)
+ if (!chip || !data)
return -EINVAL;
if (!chip->ipi_send_single && !chip->ipi_send_mask)
@@ -197,6 +199,10 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (cpu >= nr_cpu_ids)
return -EINVAL;
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask)
+ return -EINVAL;
+
if (dest) {
if (!cpumask_subset(dest, ipimask))
return -EINVAL;
@@ -216,7 +222,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
{
@@ -250,7 +256,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
}
/**
- * ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
* @desc: pointer to irq_desc of the IRQ
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
@@ -258,7 +264,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
{
@@ -298,11 +304,11 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
/**
* ipi_send_single - Send an IPI to a single CPU
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @cpu: destination CPU, must in the destination mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_single(unsigned int virq, unsigned int cpu)
{
@@ -319,11 +325,11 @@ EXPORT_SYMBOL_GPL(ipi_send_single);
/**
* ipi_send_mask - Send an IPI to target CPU(s)
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
{
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 48006608baf0..dd76323ea3fd 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -24,10 +24,6 @@ struct irq_sim_irq_ctx {
struct irq_sim_work_ctx *work_ctx;
};
-struct irq_sim_devres {
- struct irq_domain *domain;
-};
-
static void irq_sim_irqmask(struct irq_data *data)
{
struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
@@ -159,7 +155,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
* irq_domain_create_sim - Create a new interrupt simulator irq_domain and
* allocate a range of dummy interrupts.
*
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate.
*
* On success: return a new irq_domain object.
@@ -185,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
goto err_free_bitmap;
work_ctx->irq_count = num_irqs;
- init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+ work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
return work_ctx->domain;
@@ -216,11 +212,11 @@ void irq_domain_remove_sim(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
-static void devm_irq_domain_release_sim(struct device *dev, void *res)
+static void devm_irq_domain_remove_sim(void *data)
{
- struct irq_sim_devres *this = res;
+ struct irq_domain *domain = data;
- irq_domain_remove_sim(this->domain);
+ irq_domain_remove_sim(domain);
}
/**
@@ -228,7 +224,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
* a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
* On success: return a new irq_domain object.
@@ -238,20 +234,17 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
- struct irq_sim_devres *dr;
+ struct irq_domain *domain;
+ int ret;
- dr = devres_alloc(devm_irq_domain_release_sim,
- sizeof(*dr), GFP_KERNEL);
- if (!dr)
- return ERR_PTR(-ENOMEM);
+ domain = irq_domain_create_sim(fwnode, num_irqs);
+ if (IS_ERR(domain))
+ return domain;
- dr->domain = irq_domain_create_sim(fwnode, num_irqs);
- if (IS_ERR(dr->domain)) {
- devres_free(dr);
- return dr->domain;
- }
+ ret = devm_add_action_or_reset(dev, devm_irq_domain_remove_sim, domain);
+ if (ret)
+ return ERR_PTR(ret);
- devres_add(dev, dr);
- return dr->domain;
+ return domain;
}
EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..371eb1711d34 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -12,8 +12,7 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/radix-tree.h>
-#include <linux/bitmap.h>
+#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
#include <linux/sysfs.h>
@@ -31,7 +30,7 @@ static int __init irq_affinity_setup(char *str)
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
- * bugreports caused by random comandline masks
+ * bugreports caused by random commandline masks
*/
cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
return 1;
@@ -131,7 +130,40 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
+ MT_FLAGS_ALLOC_RANGE |
+ MT_FLAGS_LOCK_EXTERN |
+ MT_FLAGS_USE_RCU,
+ sparse_irq_lock);
+
+static int irq_find_free_area(unsigned int from, unsigned int cnt)
+{
+ MA_STATE(mas, &sparse_irqs, 0, 0);
+
+ if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt))
+ return -ENOSPC;
+ return mas.index;
+}
+
+static unsigned int irq_find_at_or_after(unsigned int offset)
+{
+ unsigned long index = offset;
+ struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs);
+
+ return desc ? irq_desc_get_irq(desc) : nr_irqs;
+}
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0);
+}
+
+static void delete_irq_desc(unsigned int irq)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ mas_erase(&mas);
+}
#ifdef CONFIG_SPARSE_IRQ
@@ -147,12 +179,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- int cpu, irq = desc->irq_data.irq;
ssize_t ret = 0;
char *p = "";
+ int cpu;
for_each_possible_cpu(cpu) {
- unsigned int c = kstat_irqs_cpu(irq, cpu);
+ unsigned int c = irq_desc_kstat_cpu(desc, cpu);
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
p = ",";
@@ -188,7 +220,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
raw_spin_lock_irq(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+ ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
raw_spin_unlock_irq(&desc->lock);
return ret;
@@ -251,7 +283,7 @@ static ssize_t actions_show(struct kobject *kobj,
char *p = "";
raw_spin_lock_irq(&desc->lock);
- for (action = desc->action; action != NULL; action = action->next) {
+ for_each_action_of_desc(desc, action) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
p, action->name);
p = ",";
@@ -277,7 +309,7 @@ static struct attribute *irq_attrs[] = {
};
ATTRIBUTE_GROUPS(irq);
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = irq_groups,
@@ -288,22 +320,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
@@ -332,7 +367,7 @@ postcore_initcall(irq_sysfs_init);
#else /* !CONFIG_SYSFS */
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
};
@@ -341,23 +376,13 @@ static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
-static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
-
-static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
-{
- radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return radix_tree_lookup(&irq_desc_tree, irq);
-}
-EXPORT_SYMBOL(irq_to_desc);
-
-static void delete_irq_desc(unsigned int irq)
-{
- radix_tree_delete(&irq_desc_tree, irq);
+ return mtree_load(&sparse_irqs, irq);
}
+#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+EXPORT_SYMBOL_GPL(irq_to_desc);
+#endif
#ifdef CONFIG_SMP
static void free_masks(struct irq_desc *desc)
@@ -405,10 +430,12 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
+ init_waitqueue_head(&desc->wait_for_threads);
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
kobject_init(&desc->kobj, &irq_kobj_type);
+ irq_resend_init(desc);
return desc;
@@ -499,7 +526,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
irq_sysfs_add(start + i, desc);
irq_add_debugfs_entry(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
err:
@@ -510,7 +536,7 @@ err:
static int irq_expand_nr_irqs(unsigned int nr)
{
- if (nr > IRQ_BITMAP_BITS)
+ if (nr > MAX_SPARSE_IRQS)
return -ENOMEM;
nr_irqs = nr;
return 0;
@@ -528,18 +554,17 @@ int __init early_irq_init(void)
printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
NR_IRQS, nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
- nr_irqs = IRQ_BITMAP_BITS;
+ if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
+ nr_irqs = MAX_SPARSE_IRQS;
- if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
- initcnt = IRQ_BITMAP_BITS;
+ if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
+ initcnt = MAX_SPARSE_IRQS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, 0, NULL, NULL);
- set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
@@ -573,7 +598,9 @@ int __init early_irq_init(void)
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
mutex_init(&desc[i].request_mutex);
+ init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
+ irq_resend_init(&desc[i]);
}
return arch_early_irq_init();
}
@@ -592,6 +619,7 @@ static void free_desc(unsigned int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ delete_irq_desc(irq);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -604,8 +632,8 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
+ irq_insert_desc(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -617,7 +645,7 @@ static int irq_expand_nr_irqs(unsigned int nr)
void irq_mark_irq(unsigned int irq)
{
mutex_lock(&sparse_irq_lock);
- bitmap_set(allocated_irqs, irq, 1);
+ irq_insert_desc(irq, irq_desc + irq);
mutex_unlock(&sparse_irq_lock);
}
@@ -630,106 +658,117 @@ void irq_init_desc(unsigned int irq)
#endif /* !CONFIG_SPARSE_IRQ */
-/**
- * generic_handle_irq - Invoke the handler for a particular irq
- * @irq: The irq number to handle
- *
- */
-int generic_handle_irq(unsigned int irq)
+int handle_irq_desc(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
struct irq_data *data;
if (!desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
return 0;
}
-EXPORT_SYMBOL_GPL(generic_handle_irq);
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
- * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
- * @domain: The domain where to perform the lookup
- * @hwirq: The HW irq number to convert to a logical one
- * @lookup: Whether to perform the domain lookup or not
- * @regs: Register file coming from the low-level handling code
+ * generic_handle_irq - Invoke the handler for a particular irq
+ * @irq: The irq number to handle
*
* Returns: 0 on success, or -EINVAL if conversion has failed
- */
-int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
- bool lookup, struct pt_regs *regs)
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
+ */
+int generic_handle_irq(unsigned int irq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- unsigned int irq = hwirq;
- int ret = 0;
-
- irq_enter();
-
-#ifdef CONFIG_IRQ_DOMAIN
- if (lookup)
- irq = irq_find_mapping(domain, hwirq);
-#endif
+ return handle_irq_desc(irq_to_desc(irq));
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq);
- /*
- * Some hardware gives randomly wrong interrupts. Rather
- * than crashing, do something sensible.
- */
- if (unlikely(!irq || irq >= nr_irqs)) {
- ack_bad_irq(irq);
- ret = -EINVAL;
- } else {
- generic_handle_irq(irq);
- }
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
- irq_exit();
- set_irq_regs(old_regs);
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
return ret;
}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
#ifdef CONFIG_IRQ_DOMAIN
/**
- * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
+ * to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
- * @regs: Register file coming from the low-level handling code
- *
- * This function must be called from an NMI context.
*
* Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
*/
-int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
- struct pt_regs *regs)
+int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- unsigned int irq;
- int ret = 0;
-
- /*
- * NMI context needs to be setup earlier in order to deal with tracing.
- */
- WARN_ON(!in_nmi());
-
- irq = irq_find_mapping(domain, hwirq);
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
- /*
- * ack_bad_irq is not NMI-safe, just report
- * an invalid interrupt.
- */
- if (likely(irq))
- generic_handle_irq(irq);
- else
- ret = -EINVAL;
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
+ */
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+{
+ unsigned long flags;
+ int ret;
- set_irq_regs(old_regs);
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
return ret;
}
-#endif
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
+
+/**
+ * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
+ * to a domain.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an NMI context with irq regs
+ * initialized.
+ **/
+int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
+{
+ WARN_ON_ONCE(!in_nmi());
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
#endif
/* Dynamic interrupt handling */
@@ -750,7 +789,6 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
for (i = 0; i < cnt; i++)
free_desc(from + i);
- bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -792,8 +830,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
- from, cnt, 0);
+ start = irq_find_free_area(from, cnt);
ret = -EEXIST;
if (irq >=0 && start != irq)
goto unlock;
@@ -810,57 +847,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
-#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
-/**
- * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
- * @cnt: number of interrupts to allocate
- * @node: node on which to allocate
- *
- * Returns an interrupt number > 0 or 0, if the allocation fails.
- */
-unsigned int irq_alloc_hwirqs(int cnt, int node)
-{
- int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
-
- if (irq < 0)
- return 0;
-
- for (i = irq; cnt > 0; i++, cnt--) {
- if (arch_setup_hwirq(i, node))
- goto err;
- irq_clear_status_flags(i, _IRQ_NOREQUEST);
- }
- return irq;
-
-err:
- for (i--; i >= irq; i--) {
- irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
- arch_teardown_hwirq(i);
- }
- irq_free_descs(irq, cnt);
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
-
-/**
- * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
- * @from: Free from irq number
- * @cnt: number of interrupts to free
- *
- */
-void irq_free_hwirqs(unsigned int from, int cnt)
-{
- int i, j;
-
- for (i = from, j = cnt; j > 0; i++, j--) {
- irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
- arch_teardown_hwirq(i);
- }
- irq_free_descs(from, cnt);
-}
-EXPORT_SYMBOL_GPL(irq_free_hwirqs);
-#endif
-
/**
* irq_get_next_irq - get next allocated irq number
* @offset: where to start the search
@@ -869,7 +855,7 @@ EXPORT_SYMBOL_GPL(irq_free_hwirqs);
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return find_next_bit(allocated_irqs, nr_irqs, offset);
+ return irq_find_at_or_after(offset);
}
struct irq_desc *
@@ -975,15 +961,7 @@ static bool irq_is_nmi(struct irq_desc *desc)
return desc->istate & IRQS_NMI;
}
-/**
- * kstat_irqs - Get the statistics for an interrupt
- * @irq: The interrupt number
- *
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. The caller must ensure that the interrupt is not removed
- * concurrently.
- */
-unsigned int kstat_irqs(unsigned int irq)
+static unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
@@ -994,21 +972,22 @@ unsigned int kstat_irqs(unsigned int irq)
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
- return desc->tot_count;
+ return data_race(desc->tot_count);
for_each_possible_cpu(cpu)
- sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
+ sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
return sum;
}
/**
- * kstat_irqs_usr - Get the statistics for an interrupt
+ * kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
*
* Returns the sum of interrupt counts on all cpus since boot for @irq.
- * Contrary to kstat_irqs() this can be called from any context.
- * It uses rcu since a concurrent removal of an interrupt descriptor is
- * observing an rcu grace period before delayed_free_desc()/irq_kobj_release().
+ *
+ * It uses rcu to protect the access since a concurrent removal of an
+ * interrupt descriptor is observing an rcu grace period before
+ * delayed_free_desc()/irq_kobj_release().
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
@@ -1019,3 +998,17 @@ unsigned int kstat_irqs_usr(unsigned int irq)
rcu_read_unlock();
return sum;
}
+
+#ifdef CONFIG_LOCKDEP
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+ struct lock_class_key *request_class)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ lockdep_set_class(&desc->lock, lock_class);
+ lockdep_set_class(&desc->request_mutex, request_class);
+ }
+}
+EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
+#endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a4c2c915511d..0bdef4fe925b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -42,7 +45,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
#endif
-const struct fwnode_operations irqchip_fwnode_ops;
+static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode)
+{
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ return fwid->name;
+}
+
+const struct fwnode_operations irqchip_fwnode_ops = {
+ .get_name = irqchip_fwnode_get_name,
+};
EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
/**
@@ -53,7 +65,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* @name: Optional user provided domain name
* @pa: Optional user-provided physical address
*
- * Allocate a struct irqchip_fwid, and return a poiner to the embedded
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
* fwnode_handle (or NULL on failure).
*
* Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
@@ -91,7 +103,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->pa = pa;
- fwid->fwnode.ops = &irqchip_fwnode_ops;
+ fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops);
return &fwid->fwnode;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
@@ -105,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -114,35 +126,29 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
+ unsigned int size,
+ irq_hw_number_t hwirq_max,
+ int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irqchip_fwid *fwid;
struct irq_domain *domain;
static atomic_t unknown_domains;
- domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
+ if (WARN_ON((size && direct_max) ||
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) ||
+ (direct_max && (direct_max != hwirq_max))))
+ return NULL;
+
+ domain = kzalloc_node(struct_size(domain, revmap, size),
GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
if (!domain)
return NULL;
- if (fwnode && is_fwnode_irqchip(fwnode)) {
+ if (is_fwnode_irqchip(fwnode)) {
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
switch (fwid->type) {
@@ -176,9 +182,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
return NULL;
}
- strreplace(name, '/', ':');
-
- domain->name = name;
+ domain->name = strreplace(name, '/', ':');
domain->fwnode = fwnode;
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
@@ -196,23 +200,70 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
fwnode_handle_get(fwnode);
+ fwnode_dev_initialized(fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_tree_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
+
+ if (direct_max)
+ domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
+
domain->revmap_size = size;
- domain->revmap_direct_max_irq = direct_max;
+
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @fwnode: firmware node for the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initializes an irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
+ ops, host_data);
+ if (domain)
+ __irq_domain_publish(domain);
+
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
@@ -244,6 +295,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
+ fwnode_dev_initialized(domain->fwnode, false);
fwnode_handle_put(domain->fwnode);
if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
kfree(domain->name);
@@ -281,10 +333,11 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
mutex_unlock(&irq_domain_mutex);
}
+EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
/**
- * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
- * @of_node: pointer to interrupt controller's device tree node.
+ * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs
+ * @fwnode: firmware node for the interrupt controller
* @size: total number of irqs in mapping
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
@@ -300,15 +353,15 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
* irqs get mapped dynamically on the fly. However, if the controller requires
* static virq assignments (non-DT boot) then it will set that up correctly.
*/
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- const struct irq_domain_ops *ops,
- void *host_data)
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+ unsigned int size,
+ unsigned int first_irq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irq_domain *domain;
- domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
+ domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
if (!domain)
return NULL;
@@ -316,7 +369,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
/* attempt to allocated irq_descs */
int rc = irq_alloc_descs(first_irq, first_irq, size,
- of_node_to_nid(of_node));
+ of_node_to_nid(to_of_node(fwnode)));
if (rc < 0)
pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
first_irq);
@@ -326,7 +379,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
return domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_domain_create_simple);
/**
* irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
@@ -350,16 +403,27 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
const struct irq_domain_ops *ops,
void *host_data)
{
+ return irq_domain_create_legacy(of_node_to_fwnode(of_node), size,
+ first_irq, first_hwirq, ops, host_data);
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+
+struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
+ unsigned int size,
+ unsigned int first_irq,
+ irq_hw_number_t first_hwirq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
struct irq_domain *domain;
- domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size,
- first_hwirq + size, 0, ops, host_data);
+ domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data);
if (domain)
irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+EXPORT_SYMBOL_GPL(irq_domain_create_legacy);
/**
* irq_find_matching_fwspec() - Locates a domain for a given fwspec
@@ -404,31 +468,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
- * IRQ remapping
- *
- * Return: false if any MSI irq domain does not support IRQ remapping,
- * true otherwise (including if there is no MSI irq domain)
- */
-bool irq_domain_check_msi_remap(void)
-{
- struct irq_domain *h;
- bool ret = true;
-
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(h, &irq_domain_list, link) {
- if (irq_domain_is_msi(h) &&
- !irq_domain_hierarchical_is_msi_remap(h)) {
- ret = false;
- break;
- }
- }
- mutex_unlock(&irq_domain_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
-
-/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
@@ -458,33 +497,48 @@ struct irq_domain *irq_get_default_host(void)
{
return irq_default_domain;
}
+EXPORT_SYMBOL_GPL(irq_get_default_host);
+
+static bool irq_domain_is_nomap(struct irq_domain *domain)
+{
+ return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) &&
+ (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP);
+}
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&domain->revmap_tree_mutex);
+ lockdep_assert_held(&domain->root->mutex);
+
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], NULL);
+ else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_tree_mutex);
- }
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = irq_data->irq;
- } else {
- mutex_lock(&domain->revmap_tree_mutex);
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], irq_data);
+ else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_tree_mutex);
- }
}
-void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
@@ -494,6 +548,9 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -513,10 +570,12 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -529,7 +588,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -546,23 +604,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -575,12 +639,12 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
+#ifdef CONFIG_IRQ_DOMAIN_NOMAP
/**
* irq_create_direct_mapping() - Allocate an irq for direct mapping
* @domain: domain to allocate the irq for or NULL for default domain
@@ -605,9 +669,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_direct_max_irq) {
- pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_direct_max_irq);
+ if (virq >= domain->hwirq_max) {
+ pr_err("ERROR: no free irqs available below %lu maximum\n",
+ domain->hwirq_max);
irq_free_desc(virq);
return 0;
}
@@ -621,51 +685,26 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+#endif
-/**
- * irq_create_mapping() - Map a hardware interrupt into linux irq space
- * @domain: domain owning this hardware interrupt or NULL for default domain
- * @hwirq: hardware irq number in that domain space
- *
- * Only one mapping per hardware interrupt is permitted. Returns a linux
- * irq number.
- * If the sense/trigger is to be specified, set_irq_type() should be called
- * on the number returned from that call.
- */
-unsigned int irq_create_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
+ struct device_node *of_node = irq_domain_get_of_node(domain);
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
- /* Look for default domain if nececssary */
- if (domain == NULL)
- domain = irq_default_domain;
- if (domain == NULL) {
- WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
- return 0;
- }
- pr_debug("-> using domain @%p\n", domain);
-
- of_node = irq_domain_get_of_node(domain);
-
- /* Check if mapping already exists */
- virq = irq_find_mapping(domain, hwirq);
- if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
- }
-
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
}
- if (irq_domain_associate(domain, virq, hwirq)) {
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
irq_free_desc(virq);
return 0;
}
@@ -675,42 +714,48 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
return virq;
}
-EXPORT_SYMBOL_GPL(irq_create_mapping);
/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
+ * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
+ * @domain: domain owning this hardware interrupt or NULL for default domain
+ * @hwirq: hardware irq number in that domain space
+ * @affinity: irq affinity
*
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
+ * Only one mapping per hardware interrupt is permitted. Returns a linux
+ * irq number.
+ * If the sense/trigger is to be specified, set_irq_type() should be called
+ * on the number returned from that call.
*/
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
- irq_hw_number_t hwirq_base, int count)
+unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
- int ret;
+ int virq;
- of_node = irq_domain_get_of_node(domain);
- ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(of_node));
- if (unlikely(ret < 0))
- return ret;
+ /* Look for default domain if necessary */
+ if (domain == NULL)
+ domain = irq_default_domain;
+ if (domain == NULL) {
+ WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
+ return 0;
+ }
- irq_domain_associate_many(domain, irq_base, hwirq_base, count);
- return 0;
+ mutex_lock(&domain->root->mutex);
+
+ /* Check if mapping already exists */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
+ }
+
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
+
+ return virq;
}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
+EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
static int irq_domain_translate(struct irq_domain *d,
struct irq_fwspec *fwspec,
@@ -730,18 +775,18 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
- unsigned int count,
- struct irq_fwspec *fwspec)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count, struct irq_fwspec *fwspec)
{
int i;
- fwspec->fwnode = np ? &np->fwnode : NULL;
+ fwspec->fwnode = of_node_to_fwnode(np);
fwspec->param_count = count;
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@@ -775,6 +820,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -787,7 +834,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -795,40 +842,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -871,37 +923,55 @@ void irq_dispose_mapping(unsigned int virq)
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
/**
- * irq_find_mapping() - Find a linux irq from a hw irq number.
+ * __irq_resolve_mapping() - Find a linux irq from a hw irq number.
* @domain: domain owning this hardware interrupt
* @hwirq: hardware irq number in that domain space
+ * @irq: optional pointer to return the Linux irq if required
+ *
+ * Returns the interrupt descriptor.
*/
-unsigned int irq_find_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ unsigned int *irq)
{
+ struct irq_desc *desc = NULL;
struct irq_data *data;
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
- return 0;
+ return desc;
+
+ if (irq_domain_is_nomap(domain)) {
+ if (hwirq < domain->hwirq_max) {
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
+ desc = irq_data_to_desc(data);
+ if (irq && desc)
+ *irq = hwirq;
+ }
- if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_domain_get_irq_data(domain, hwirq);
- if (data && data->hwirq == hwirq)
- return hwirq;
+ return desc;
}
+ rcu_read_lock();
/* Check if the hwirq is in the linear revmap. */
if (hwirq < domain->revmap_size)
- return domain->linear_revmap[hwirq];
+ data = rcu_dereference(domain->revmap[hwirq]);
+ else
+ data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+
+ if (likely(data)) {
+ desc = irq_data_to_desc(data);
+ if (irq)
+ *irq = data->irq;
+ }
- rcu_read_lock();
- data = radix_tree_lookup(&domain->revmap_tree, hwirq);
rcu_read_unlock();
- return data ? data->irq : 0;
+ return desc;
}
-EXPORT_SYMBOL_GPL(irq_find_mapping);
+EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
@@ -1070,12 +1140,17 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
struct irq_domain *domain;
if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
+ domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
+ domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
+
if (domain) {
+ if (parent)
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
+
+ __irq_domain_publish(domain);
}
return domain;
@@ -1091,10 +1166,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1135,6 +1206,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
return irq_data;
}
+static void __irq_domain_free_hierarchy(struct irq_data *irq_data)
+{
+ struct irq_data *tmp;
+
+ while (irq_data) {
+ tmp = irq_data;
+ irq_data = irq_data->parent_data;
+ kfree(tmp);
+ }
+}
+
static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data, *tmp;
@@ -1146,12 +1228,84 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
irq_data->parent_data = NULL;
irq_data->domain = NULL;
- while (tmp) {
- irq_data = tmp;
- tmp = tmp->parent_data;
- kfree(irq_data);
+ __irq_domain_free_hierarchy(tmp);
+ }
+}
+
+/**
+ * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy
+ * @domain: IRQ domain from which the hierarchy is to be disconnected
+ * @virq: IRQ number where the hierarchy is to be trimmed
+ *
+ * Marks the @virq level belonging to @domain as disconnected.
+ * Returns -EINVAL if @virq doesn't have a valid irq_data pointing
+ * to @domain.
+ *
+ * Its only use is to be able to trim levels of hierarchy that do not
+ * have any real meaning for this interrupt, and that the driver marks
+ * as such from its .alloc() callback.
+ */
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ if (!irqd)
+ return -EINVAL;
+
+ irqd->chip = ERR_PTR(-ENOTCONN);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy);
+
+static int irq_domain_trim_hierarchy(unsigned int virq)
+{
+ struct irq_data *tail, *irqd, *irq_data;
+
+ irq_data = irq_get_irq_data(virq);
+ tail = NULL;
+
+ /* The first entry must have a valid irqchip */
+ if (!irq_data->chip || IS_ERR(irq_data->chip))
+ return -EINVAL;
+
+ /*
+ * Validate that the irq_data chain is sane in the presence of
+ * a hierarchy trimming marker.
+ */
+ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) {
+ /* Can't have a valid irqchip after a trim marker */
+ if (irqd->chip && tail)
+ return -EINVAL;
+
+ /* Can't have an empty irqchip before a trim marker */
+ if (!irqd->chip && !tail)
+ return -EINVAL;
+
+ if (IS_ERR(irqd->chip)) {
+ /* Only -ENOTCONN is a valid trim marker */
+ if (PTR_ERR(irqd->chip) != -ENOTCONN)
+ return -EINVAL;
+
+ tail = irq_data;
}
}
+
+ /* No trim marker, nothing to do */
+ if (!tail)
+ return 0;
+
+ pr_info("IRQ%d: trimming hierarchy from %s\n",
+ virq, tail->parent_data->domain->name);
+
+ /* Sever the inner part of the hierarchy... */
+ irqd = tail;
+ tail = tail->parent_data;
+ irqd->parent_data = NULL;
+ __irq_domain_free_hierarchy(tail);
+
+ return 0;
}
static int irq_domain_alloc_irq_data(struct irq_domain *domain,
@@ -1206,7 +1360,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1215,7 +1370,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1234,7 +1389,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1287,8 +1442,15 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs)
{
- if (domain->ops->free)
- domain->ops->free(domain, irq_base, nr_irqs);
+ unsigned int i;
+
+ if (!domain->ops->free)
+ return;
+
+ for (i = 0; i < nr_irqs; i++) {
+ if (irq_domain_get_irq_data(domain, irq_base + i))
+ domain->ops->free(domain, irq_base + i, 1);
+ }
}
int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
@@ -1303,40 +1465,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program hardwares with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1355,15 +1489,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = irq_domain_trim_hierarchy(virq + i);
+ if (ret)
+ goto out_free_irq_data;
}
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1374,20 +1511,68 @@ out_free_desc:
return ret;
}
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
+
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
- if (d->hwirq < d->domain->revmap_size)
- return; /* Not using radix tree. */
+ lockdep_assert_held(&d->domain->root->mutex);
+
+ if (irq_domain_is_nomap(d->domain))
+ return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_tree_mutex);
- slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
- if (slot)
- radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
- mutex_unlock(&d->domain->revmap_tree_mutex);
+ if (d->hwirq < d->domain->revmap_size) {
+ /* Not using radix tree */
+ rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
+ } else {
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
+ }
}
/**
@@ -1403,8 +1588,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1429,47 +1614,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1485,8 +1669,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1508,37 +1692,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1552,17 +1736,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
@@ -1570,12 +1757,10 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @domain: Domain below which interrupts must be allocated
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to allocate
* @arg: Allocation data (arch/domain specific)
- *
- * Check whether the domain has been setup recursive. If not allocate
- * through the parent domain.
*/
int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs,
@@ -1591,11 +1776,9 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @domain: Domain below which interrupts must be freed
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to free
- *
- * Check whether the domain has been setup recursive. If not free
- * through the parent domain.
*/
void irq_domain_free_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs)
@@ -1681,20 +1864,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
-
-/**
- * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
- * parent has MSI remapping support
- * @domain: domain pointer
- */
-bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
-{
- for (; domain; domain = domain->parent) {
- if (irq_domain_is_msi_remap(domain))
- return true;
- }
- return false;
-}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
@@ -1722,7 +1891,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1731,20 +1900,28 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
static struct dentry *domain_dir;
static void
irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
{
seq_printf(m, "%*sname: %s\n", ind, "", d->name);
- seq_printf(m, "%*ssize: %u\n", ind + 1, "",
- d->revmap_size + d->revmap_direct_max_irq);
+ seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
if (d->ops && d->ops->debug_show)
@@ -1774,16 +1951,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
- if (!d->name || !domain_dir || d->debugfs_file)
+ if (!d->name || !domain_dir)
return;
- d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &irq_domain_debug_fops);
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(d->debugfs_file);
- d->debugfs_file = NULL;
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2a9fec53e159..1782f90cd8c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -25,12 +25,11 @@
#include "internals.h"
#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
-__read_mostly bool force_irqthreads;
-EXPORT_SYMBOL_GPL(force_irqthreads);
+DEFINE_STATIC_KEY_FALSE(force_irqthreads_key);
static int __init setup_forced_irqthreads(char *arg)
{
- force_irqthreads = true;
+ static_branch_enable(&force_irqthreads_key);
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
@@ -109,6 +108,16 @@ bool synchronize_hardirq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_hardirq);
+static void __synchronize_irq(struct irq_desc *desc)
+{
+ __synchronize_hardirq(desc, true);
+ /*
+ * We made sure that no hardirq handler is running. Now verify that no
+ * threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+}
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -128,16 +137,8 @@ void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc) {
- __synchronize_hardirq(desc, true);
- /*
- * We made sure that no hardirq handler is
- * running. Now verify that no threaded handlers are
- * active.
- */
- wait_event(desc->wait_for_threads,
- !atomic_read(&desc->threads_active));
- }
+ if (desc)
+ __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);
@@ -179,7 +180,7 @@ bool irq_can_set_affinity_usr(unsigned int irq)
/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affitnity changed
+ * @desc: irq descriptor which has affinity changed
*
* We just set IRQTF_AFFINITY and delegate the affinity setting
* to the interrupt thread itself. We can not call
@@ -190,9 +191,12 @@ void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
+ for_each_action_of_desc(desc, action) {
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ if (action->secondary && action->secondary->thread)
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
@@ -206,16 +210,8 @@ static void irq_validate_effective_affinity(struct irq_data *data)
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
}
-
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask)
-{
- cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
-}
#else
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask) { }
#endif
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -223,11 +219,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
{
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
+ const struct cpumask *prog_mask;
int ret;
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ raw_spin_lock(&tmp_mask_lock);
/*
* If this is a managed interrupt and housekeeping is enabled on
* it check whether the requested affinity mask intersects with
@@ -248,30 +249,40 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
- const struct cpumask *hk_mask, *prog_mask;
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask;
- static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
- static struct cpumask tmp_mask;
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
-
- raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
prog_mask = mask;
else
prog_mask = &tmp_mask;
- ret = chip->irq_set_affinity(data, prog_mask, force);
- raw_spin_unlock(&tmp_mask_lock);
} else {
- ret = chip->irq_set_affinity(data, mask, force);
+ prog_mask = mask;
}
+
+ /*
+ * Make sure we only provide online CPUs to the irqchip,
+ * unless we are being asked to force the affinity (in which
+ * case we do as we are told).
+ */
+ cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(&tmp_mask))
+ ret = chip->irq_set_affinity(data, &tmp_mask, force);
+ else if (force)
+ ret = chip->irq_set_affinity(data, mask, force);
+ else
+ ret = -EINVAL;
+
+ raw_spin_unlock(&tmp_mask_lock);
+
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
- /* fall through */
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
@@ -315,21 +326,25 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
/*
+ * Handle irq chips which can handle affinity only in activated
+ * state correctly
+ *
* If the interrupt is not yet activated, just store the affinity
* mask and do not call the chip driver at all. On activation the
* driver has to make sure anyway that the interrupt is in a
- * useable state so startup works.
+ * usable state so startup works.
*/
- if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+ irqd_is_activated(data) || !irqd_affinity_on_activate(data))
return false;
cpumask_copy(desc->irq_common_data.affinity, mask);
- irq_init_effective_affinity(data, mask);
+ irq_data_update_effective_affinity(data, mask);
irqd_set(data, IRQD_AFFINITY_SET);
return true;
}
@@ -344,7 +359,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
@@ -367,7 +382,78 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
+/**
+ * irq_update_affinity_desc - Update affinity management for an interrupt
+ * @irq: The interrupt number to update
+ * @affinity: Pointer to the affinity descriptor
+ *
+ * This interface can be used to configure the affinity management of
+ * interrupts which have been allocated already.
+ *
+ * There are certain limitations on when it may be used - attempts to use it
+ * for when the kernel is configured for generic IRQ reservation mode (in
+ * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with
+ * managed/non-managed interrupt accounting. In addition, attempts to use it on
+ * an interrupt which is already started or which has already been configured
+ * as managed will also fail, as these mean invalid init state or double init.
+ */
+int irq_update_affinity_desc(unsigned int irq,
+ struct irq_affinity_desc *affinity)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+ bool activated;
+ int ret = 0;
+
+ /*
+ * Supporting this with the reservation scheme used by x86 needs
+ * some more thought. Fail it for now.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
+ return -EOPNOTSUPP;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return -EINVAL;
+
+ /* Requires the interrupt to be shut down */
+ if (irqd_is_started(&desc->irq_data)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /* Interrupts which are already managed cannot be modified */
+ if (irqd_affinity_is_managed(&desc->irq_data)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Deactivate the interrupt. That's required to undo
+ * anything an earlier activation has established.
+ */
+ activated = irqd_is_activated(&desc->irq_data);
+ if (activated)
+ irq_domain_deactivate_irq(&desc->irq_data);
+
+ if (affinity->is_managed) {
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+ irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
+ }
+
+ cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
+
+ /* Restore the activation state */
+ if (activated)
+ irq_domain_activate_irq(&desc->irq_data, false);
+
+out_unlock:
+ irq_put_desc_busunlock(desc, flags);
+ return ret;
+}
+
+static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
+ bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -382,7 +468,38 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
return ret;
}
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Fails if cpumask does not contain an online CPU
+ */
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ return __irq_set_affinity(irq, cpumask, false);
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity);
+
+/**
+ * irq_force_affinity - Force the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Same as irq_set_affinity, but without checking the mask against
+ * online cpus.
+ *
+ * Solely for low level cpu hotplug code, where we need to make per
+ * cpu interrupts affine before the cpu becomes online.
+ */
+int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ return __irq_set_affinity(irq, cpumask, true);
+}
+EXPORT_SYMBOL_GPL(irq_force_affinity);
+
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+ bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@@ -391,12 +508,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+ if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
@@ -612,10 +728,13 @@ EXPORT_SYMBOL(disable_irq_nosync);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
@@ -864,7 +983,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
- /* fall through */
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -980,7 +1099,7 @@ again:
* to IRQS_INPROGRESS and the irq line is masked forever.
*
* This also serializes the state of shared oneshot handlers
- * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * versus "desc->threads_oneshot |= action->thread_mask;" in
* irq_wake_thread(). See the comment there which explains the
* serialization.
*/
@@ -1068,18 +1187,22 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
/*
* Interrupts explicitly requested as threaded interrupts want to be
- * preemtible - many of them need to sleep and wait for slow busses to
+ * preemptible - many of them need to sleep and wait for slow busses to
* complete.
*/
static irqreturn_t irq_thread_fn(struct irq_desc *desc,
@@ -1095,7 +1218,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
return ret;
}
-static void wake_threads_waitq(struct irq_desc *desc)
+void wake_threads_waitq(struct irq_desc *desc)
{
if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
@@ -1141,6 +1264,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
}
/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
@@ -1151,14 +1299,18 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
- &action->thread_flags))
+ irq_thread_set_ready(desc, action);
+
+ sched_set_fifo(current);
+
+ if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
handler_fn = irq_forced_thread_fn;
else
handler_fn = irq_thread_fn;
init_task_work(&on_exit_work, irq_thread_dtor);
- task_work_add(current, &on_exit_work, false);
+ task_work_add(current, &on_exit_work, TWA_NONE);
irq_thread_check_affinity(desc, action);
@@ -1213,7 +1365,7 @@ EXPORT_SYMBOL_GPL(irq_wake_thread);
static int irq_setup_forced_threading(struct irqaction *new)
{
- if (!force_irqthreads)
+ if (!force_irqthreads())
return 0;
if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
return 0;
@@ -1304,9 +1456,6 @@ static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
struct task_struct *t;
- struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
if (!secondary) {
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -1314,14 +1463,11 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
} else {
t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
new->name);
- param.sched_priority -= 1;
}
if (IS_ERR(t))
return PTR_ERR(t);
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
-
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@@ -1579,8 +1725,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
@@ -1612,8 +1756,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (new->flags & IRQF_PERCPU) {
irqd_set(&desc->irq_data, IRQD_PER_CPU);
irq_settings_set_per_cpu(desc);
+ if (new->flags & IRQF_NO_DEBUG)
+ irq_settings_set_no_debug(desc);
}
+ if (noirqdebug)
+ irq_settings_set_no_debug(desc);
+
if (new->flags & IRQF_ONESHOT)
desc->istate |= IRQS_ONESHOT;
@@ -1623,7 +1772,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
- if (irq_settings_can_autoenable(desc)) {
+ if (!(new->flags & IRQF_NO_AUTOEN) &&
+ irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
} else {
/*
@@ -1670,14 +1820,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq_setup_timings(desc, new);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
- if (new->secondary)
- wake_up_process(new->secondary->thread);
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
@@ -1708,15 +1852,13 @@ out_thread:
struct task_struct *t = new->thread;
new->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
if (new->secondary && new->secondary->thread) {
struct task_struct *t = new->secondary->thread;
new->secondary->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
out_mput:
module_put(desc->owner);
@@ -1802,7 +1944,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* supports it also make sure that there is no (not yet serviced)
* interrupt in flight at the hardware level.
*/
- __synchronize_hardirq(desc, true);
+ __synchronize_irq(desc);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1827,18 +1969,15 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* the same bit to a newly requested action.
*/
if (action->thread) {
- kthread_stop(action->thread);
- put_task_struct(action->thread);
- if (action->secondary && action->secondary->thread) {
- kthread_stop(action->secondary->thread);
- put_task_struct(action->secondary->thread);
- }
+ kthread_stop_put(action->thread);
+ if (action->secondary && action->secondary->thread)
+ kthread_stop_put(action->secondary->thread);
}
/* Last action releases resources */
if (!desc->action) {
/*
- * Reaquire bus lock as irq_release_resources() might
+ * Reacquire bus lock as irq_release_resources() might
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
@@ -1961,9 +2100,9 @@ const void *free_nmi(unsigned int irq, void *dev_id)
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts
- * If NULL and thread_fn != NULL the default
- * primary handler is installed
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
@@ -1997,7 +2136,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
- *
+ * IRQF_ONESHOT Run thread_fn with interrupt line masked
*/
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
@@ -2016,10 +2155,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*
+ * Also shared interrupts do not go well with disabling auto enable.
+ * The sharing interrupt might request it while it's still disabled
+ * and then wait for interrupts forever.
+ *
* Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
* it cannot be set along with IRQF_NO_SUSPEND.
*/
if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) ||
(!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
@@ -2175,7 +2319,8 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
desc = irq_to_desc(irq);
- if (!desc || irq_settings_can_autoenable(desc) ||
+ if (!desc || (irq_settings_can_autoenable(desc) &&
+ !(irqflags & IRQF_NO_AUTOEN)) ||
!irq_settings_can_request(desc) ||
WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
!irq_supports_nmi(desc))
@@ -2672,7 +2817,7 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be storeed
+ * @state: a pointer to a boolean where the state is to be stored
*
* This call snapshots the internal irqchip state of an
* interrupt, returning into @state the bit corresponding to
@@ -2711,7 +2856,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
- * This function should be called with preemption disabled if the
+ * This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
@@ -2731,8 +2876,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip))
- return -ENODEV;
+ if (WARN_ON_ONCE(!chip)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
if (chip->irq_set_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2745,7 +2892,46 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
if (data)
err = chip->irq_set_irqchip_state(data, which, val);
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return err;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_has_action - Check whether an interrupt is requested
+ * @irq: The linux irq number
+ *
+ * Returns: A snapshot of the current state
+ */
+bool irq_has_action(unsigned int irq)
+{
+ bool res;
+
+ rcu_read_lock();
+ res = irq_desc_has_action(irq_to_desc(irq));
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL_GPL(irq_has_action);
+
+/**
+ * irq_check_status_bit - Check whether bits in the irq descriptor status are set
+ * @irq: The linux irq number
+ * @bitmask: The bitmask to evaluate
+ *
+ * Returns: True if one of the bits in @bitmask is set
+ */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
+{
+ struct irq_desc *desc;
+ bool res = false;
+
+ rcu_read_lock();
+ desc = irq_to_desc(irq);
+ if (desc)
+ res = !!(desc->status_use_accessors & bitmask);
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL_GPL(irq_check_status_bit);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 30cc217b8631..75d0ae490e29 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -280,12 +280,13 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
/**
* irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
* @m: Matrix pointer
- * @cpu: On which CPU the interrupt should be allocated
+ * @msk: Which CPUs to search in
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
*/
int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
unsigned int *mapped_cpu)
{
- unsigned int bit, cpu, end = m->alloc_end;
+ unsigned int bit, cpu, end;
struct cpumap *cm;
if (cpumask_empty(msk))
@@ -337,15 +338,14 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
* irq_matrix_reserve - Reserve interrupts
* @m: Matrix pointer
*
- * This is merily a book keeping call. It increments the number of globally
+ * This is merely a book keeping call. It increments the number of globally
* reserved interrupt bits w/o actually allocating them. This allows to
* setup interrupt descriptors w/o assigning low level resources to it.
* The actual allocation happens when the interrupt gets activated.
*/
void irq_matrix_reserve(struct irq_matrix *m)
{
- if (m->global_reserved <= m->global_available &&
- m->global_reserved + 1 > m->global_available)
+ if (m->global_reserved == m->global_available)
pr_warn("Interrupt reservation exceeds available resources\n");
m->global_reserved++;
@@ -356,7 +356,7 @@ void irq_matrix_reserve(struct irq_matrix *m)
* irq_matrix_remove_reserved - Remove interrupt reservation
* @m: Matrix pointer
*
- * This is merily a book keeping call. It decrements the number of globally
+ * This is merely a book keeping call. It decrements the number of globally
* reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
* interrupt was never in use and a real vector allocated, which undid the
* reservation.
@@ -380,6 +380,13 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
unsigned int cpu, bit;
struct cpumap *cm;
+ /*
+ * Not required in theory, but matrix_find_best_cpu() uses
+ * for_each_cpu() which ignores the cpumask on UP .
+ */
+ if (cpumask_empty(msk))
+ return -EINVAL;
+
cpu = matrix_find_best_cpu(m, msk);
if (cpu == UINT_MAX)
return -ENOSPC;
@@ -416,7 +423,9 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
return;
- clear_bit(bit, cm->alloc_map);
+ if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map)))
+ return;
+
cm->allocated--;
if(managed)
cm->managed_allocated--;
@@ -457,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
}
/**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
* @m: Pointer to the matrix to search
*
- * This returns number of allocated irqs
+ * This returns number of allocated non-managed interrupts.
*/
unsigned int irq_matrix_allocated(struct irq_matrix *m)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return cm->allocated;
+ return cm->allocated - cm->managed_allocated;
}
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index def48589ea48..61ca924ef4b4 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,7 +7,7 @@
/**
* irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
- * @desc: Interrupt descpriptor to clean up
+ * @desc: Interrupt descriptor to clean up
* @force_clear: If set clear the move pending bit unconditionally.
* If not set, clear it only when the dying CPU is the
* last one in the pending mask.
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index eb95f6106a1e..79b4a58ba9c3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -5,7 +5,7 @@
*
* This file is licensed under GPLv2.
*
- * This file contains common code to support Message Signalled Interrupt for
+ * This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
#include <linux/types.h>
@@ -14,46 +14,243 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
#include "internals.h"
/**
- * alloc_msi_entry - Allocate an initialize msi_entry
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
+static inline int msi_sysfs_create_group(struct device *dev);
+
+
+/**
+ * msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
*
- * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * If @affinity is not %NULL then an affinity array[@nvec] is allocated
* and the affinity masks and flags from @affinity are copied.
+ *
+ * Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
- struct msi_desc *desc;
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
- INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity,
- nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
-
return desc;
}
-void free_msi_entry(struct msi_desc *entry)
+static void msi_free_desc(struct msi_desc *desc)
+{
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
+{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
+ int ret;
+
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
{
- kfree(entry->affinity);
- kfree(entry);
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
+/**
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
+ */
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ unsigned int idx;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -69,7 +266,354 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static void msi_device_data_release(struct device *dev, void *res)
+{
+ struct msi_device_data *md = res;
+ int i;
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
+ dev->msi.data = NULL;
+}
+
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret, i;
+
+ if (dev->msi.data)
+ return 0;
+
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_lock_descs);
+
+/**
+ * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct xarray *xa = &md->__domains[domid].store;
+ struct msi_desc *desc;
+
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_XA_MAX_INDEX;
+ return NULL;
+}
+
+/**
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
+ *
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * The first invocation of msi_next_desc() has to be preceeded by a
+ * successful invocation of __msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
+ * @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
+{
+ struct msi_desc *desc;
+ unsigned int ret = 0;
+ bool pcimsi = false;
+ struct xarray *xa;
+
+ if (!dev->msi.data)
+ return 0;
+
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
+
+ msi_lock_descs(dev);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (pcimsi) {
+ if (index < desc->nvec_used)
+ ret = desc->irq + index;
+ } else {
+ ret = desc->irq;
+ }
+ }
+
+ msi_unlock_descs(dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
+
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
+
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
+ }
+ kfree(attrs);
+}
+
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN)
+/**
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
+ */
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -97,6 +641,8 @@ static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
*
* Intended to be used by MSI interrupt controllers which are
* implemented with hierarchical domains.
+ *
+ * Return: IRQ_SET_MASK_* result code
*/
int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
@@ -187,7 +733,6 @@ static const struct irq_domain_ops msi_domain_ops = {
.deactivate = msi_domain_deactivate,
};
-#ifdef GENERIC_MSI_DOMAIN_OPS
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
msi_alloc_info_t *arg)
{
@@ -206,11 +751,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
{
arg->desc = desc;
}
-#else
-#define msi_domain_ops_get_hwirq NULL
-#define msi_domain_ops_prepare NULL
-#define msi_domain_ops_set_desc NULL
-#endif /* !GENERIC_MSI_DOMAIN_OPS */
static int msi_domain_ops_init(struct irq_domain *domain,
struct msi_domain_info *info,
@@ -227,19 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
- .get_hwirq = msi_domain_ops_get_hwirq,
- .msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
- .msi_prepare = msi_domain_ops_prepare,
- .set_desc = msi_domain_ops_set_desc,
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -251,12 +783,13 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
+ if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
+ return;
+
if (ops->get_hwirq == NULL)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
if (ops->set_desc == NULL)
@@ -272,30 +805,257 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain)
+ irq_domain_update_bus_token(domain, info->bus_token);
+
+ return domain;
+}
+
/**
- * msi_create_irq_domain - Create a MSI interrupt domain
+ * msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
* @info: MSI domain info
* @parent: Parent irq domain
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
*/
struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
+
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct msi_domain_template *bundle;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ if (!fwnode)
+ goto free_bundle;
+
+ if (msi_setup_device_data(dev))
+ goto free_fwnode;
+
+ msi_lock_descs(dev);
+
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ goto fail;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ goto fail;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ goto fail;
+
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+ msi_unlock_descs(dev);
+ return true;
+
+fail:
+ msi_unlock_descs(dev);
+free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+free_bundle:
+ kfree(bundle);
+ return false;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct fwnode_handle *fwnode = NULL;
+ struct msi_domain_info *info;
struct irq_domain *domain;
- if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ msi_lock_descs(dev);
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = msi_get_device_domain(dev, domid);
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ if (!domain || !irq_domain_is_msi_device(domain))
+ goto unlock;
- return domain;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
+ irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
+ kfree(container_of(info, struct msi_domain_template, info));
+
+unlock:
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+ bool ret = false;
+
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ ret = info->bus_token == bus_token;
+ }
+ msi_unlock_descs(dev);
+ return ret;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -303,56 +1063,81 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- int ret;
-
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
- return ret;
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq, int nvec, msi_alloc_info_t *arg)
+ int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
struct msi_desc *desc;
- int ret = 0;
+ struct xarray *xa;
+ int ret, virq;
- for_each_msi_entry(desc, dev) {
- /* Don't even try the multi-MSI brain damage. */
- if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
- ret = -EINVAL;
- break;
- }
+ msi_lock_descs(dev);
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
+ if (!msi_ctrl_valid(dev, &ctrl)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
+ if (ret)
+ goto unlock;
+
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+
+ for (virq = virq_base; virq < virq_base + nvec; virq++) {
+ desc = xa_load(xa, virq);
+ desc->irq = virq;
ops->set_desc(arg, desc);
- /* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
- arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
- break;
+ goto fail;
- irq_set_msi_desc_off(desc->irq, 0, desc);
+ irq_set_msi_desc(virq, desc);
}
+ msi_unlock_descs(dev);
+ return 0;
- if (ret) {
- /* Mop up the damage */
- for_each_msi_entry(desc, dev) {
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
-
- irq_domain_free_irqs_common(domain, desc->irq, 1);
- }
+fail:
+ for (--virq; virq >= virq_base; virq--) {
+ msi_domain_depopulate_descs(dev, virq, 1);
+ irq_domain_free_irqs_common(domain, virq, 1);
}
-
+ msi_domain_free_descs(dev, &ctrl);
+unlock:
+ msi_unlock_descs(dev);
return ret;
}
+void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
+{
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
+ desc->irq = 0;
+}
+
/*
* Carefully check whether the device can use reservation mode. If
* reservation mode is enabled then the early activation will assign a
@@ -370,8 +1155,15 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
{
struct msi_desc *desc;
- if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_VMD_MSI:
+ break;
+ default:
return false;
+ }
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
@@ -381,148 +1173,495 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
/*
* Checking the first MSI descriptor is sufficient. MSIX supports
- * masking and MSI does so when the maskbit is set.
+ * masking and MSI does so when the can_mask attribute is set.
*/
- desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
}
-/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @nvec: The number of interrupts to allocate
- *
- * Returns 0 on success or an error code.
- */
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
+{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+
+ /*
+ * If the interrupt is managed but no CPU is available to
+ * service it, shut it down until better times. Note that
+ * we only do this on the !RESERVE path as x86 (the only
+ * architecture using this flag) deals with this in a
+ * different way by using a catch-all vector.
+ */
+ if ((vflags & VIRQ_ACTIVATE) &&
+ irqd_affinity_is_managed(irqd) &&
+ !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+ cpu_online_mask)) {
+ irqd_set_managed_shutdown(irqd);
+ return 0;
+ }
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
+}
+
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- struct irq_data *irq_data;
+ unsigned int vflags = 0, allocated = 0;
+ msi_alloc_info_t arg = { };
struct msi_desc *desc;
- msi_alloc_info_t arg;
+ unsigned long idx;
int i, ret, virq;
- bool can_reserve;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev))
+ vflags |= VIRQ_CAN_RESERVE;
+
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
}
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ allocated++;
}
+ return 0;
+}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
- can_reserve = msi_check_reservation_mode(domain, info, dev);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
- for_each_msi_entry(desc, dev) {
- virq = desc->irq;
- if (desc->nvec_used == 1)
- dev_dbg(dev, "irq %d for MSI\n", virq);
- else
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
- /*
- * This flag is set by the PCI layer as we need to activate
- * the MSI entries before the PCI layer enables MSI in the
- * card. Otherwise the card latches a random msi message.
- */
- if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
- continue;
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
- irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
- }
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
- /*
- * If these interrupts use reservation mode, clear the activated bit
- * so request_irq() will assign the final vector.
- */
- if (can_reserve) {
- for_each_msi_entry(desc, dev) {
- irq_data = irq_domain_get_irq_data(domain, desc->irq);
- irqd_clr_activated(irq_data);
- }
- }
- return 0;
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
-cleanup:
- for_each_msi_entry(desc, dev) {
- struct irq_data *irqd;
+ info = domain->host_data;
- if (desc->irq == virq)
- break;
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
- irqd = irq_domain_get_irq_data(domain, desc->irq);
- if (irqd_is_activated(irqd))
- irq_domain_deactivate_irq(irqd);
- }
- msi_domain_free_irqs(domain, dev);
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
return ret;
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own descriptor
+ * allocation/free.
+ *
+ * Return: %0 on success or an error code.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ int ret;
+
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+ return ret;
+}
+
+/**
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
struct msi_desc *desc;
+ int ret;
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ goto unlock;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ goto unlock;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ goto unlock;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+unlock:
+ msi_unlock_descs(dev);
+ return map;
+}
+
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
+ struct msi_desc *desc;
+ unsigned long idx;
+ int i;
+
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_domain_free_descs(dev, ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
+ */
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
+{
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
+}
+
+/**
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ */
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+ msi_unlock_descs(dev);
+}
+
/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
- * Returns the pointer to the msi_domain_info stored in
- * @domain->host_data.
+ * Return: the pointer to the msi_domain_info stored in @domain->host_data.
*/
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 8f557fa1f4fe..c556bc49d213 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
static bool suspend_device_irq(struct irq_desc *desc)
{
+ unsigned long chipflags = irq_desc_get_chip(desc)->flags;
+ struct irq_data *irqd = &desc->irq_data;
+
if (!desc->action || irq_desc_is_chained(desc) ||
desc->no_suspend_depth)
return false;
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ if (irqd_is_wakeup_set(irqd)) {
+ irqd_set(irqd, IRQD_WAKEUP_ARMED);
+
+ if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) &&
+ irqd_irq_disabled(irqd)) {
+ /*
+ * Interrupt marked for wakeup is in disabled state.
+ * Enable interrupt here to unmask/enable in irqchip
+ * to be able to resume with such interrupts.
+ */
+ __enable_irq(desc);
+ irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
/*
* We return true here to force the caller to issue
* synchronize_irq(). We need to make sure that the
@@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc)
* chip level. The chip implementation indicates that with
* IRQCHIP_MASK_ON_SUSPEND.
*/
- if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ if (chipflags & IRQCHIP_MASK_ON_SUSPEND)
mask_irq(desc);
return true;
}
@@ -133,11 +147,22 @@ void suspend_device_irqs(void)
synchronize_irq(irq);
}
}
-EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ struct irq_data *irqd = &desc->irq_data;
+
+ irqd_clear(irqd, IRQD_WAKEUP_ARMED);
+
+ if (irqd_is_enabled_on_suspend(irqd)) {
+ /*
+ * Interrupt marked for wakeup was enabled during suspend
+ * entry. Disable such interrupts to restore them back to
+ * original state.
+ */
+ __disable_irq(desc);
+ irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
if (desc->istate & IRQS_SUSPENDED)
goto resume;
@@ -185,19 +210,23 @@ void rearm_wake_irq(unsigned int irq)
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
+ if (!desc)
return;
+ if (!(desc->istate & IRQS_SUSPENDED) ||
+ !irqd_is_wakeup_set(&desc->irq_data))
+ goto unlock;
+
desc->istate &= ~IRQS_SUSPENDED;
irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
__enable_irq(desc);
+unlock:
irq_put_desc_busunlock(desc, flags);
}
/**
- * irq_pm_syscore_ops - enable interrupt lines early
+ * irq_pm_syscore_resume - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
@@ -229,4 +258,3 @@ void resume_device_irqs(void)
{
resume_irqs(false);
}
-EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 32c071d7bc03..623b8136e9af 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -137,14 +137,14 @@ static inline int irq_select_affinity_usr(unsigned int irq)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ unsigned int irq = (int)(long)pde_data(file_inode(file));
cpumask_var_t new_value;
int err;
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
@@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_proc_show, pde_data(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
}
static const struct proc_ops irq_affinity_proc_ops = {
@@ -238,7 +238,7 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
@@ -265,7 +265,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE_DATA(inode));
+ return single_open(file, default_affinity_show, pde_data(inode));
}
static const struct proc_ops default_affinity_proc_ops = {
@@ -485,12 +485,13 @@ int show_interrupts(struct seq_file *p, void *v)
rcu_read_lock();
desc = irq_to_desc(i);
- if (!desc)
+ if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs)
+ if (desc->kstat_irqs) {
for_each_online_cpu(j)
- any_count |= *per_cpu_ptr(desc->kstat_irqs, j);
+ any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
+ }
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;
@@ -512,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
- seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
else
seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 27634f4022d0..5f2c66860ac6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -21,36 +21,34 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
-/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+/* hlist_head to handle software resend of interrupts: */
+static HLIST_HEAD(irq_resend_list);
+static DEFINE_RAW_SPINLOCK(irq_resend_lock);
/*
* Run software resends of IRQ's
*/
-static void resend_irqs(unsigned long arg)
+static void resend_irqs(struct tasklet_struct *unused)
{
struct irq_desc *desc;
- int irq;
- while (!bitmap_empty(irqs_resend, nr_irqs)) {
- irq = find_first_bit(irqs_resend, nr_irqs);
- clear_bit(irq, irqs_resend);
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
- local_irq_disable();
+ raw_spin_lock_irq(&irq_resend_lock);
+ while (!hlist_empty(&irq_resend_list)) {
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc,
+ resend_node);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
- local_irq_enable();
+ raw_spin_lock(&irq_resend_lock);
}
+ raw_spin_unlock_irq(&irq_resend_lock);
}
/* Tasklet to handle resend: */
-static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static DECLARE_TASKLET(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
- unsigned int irq = irq_desc_get_irq(desc);
-
/*
* Validate whether this interrupt can be safely injected from
* non interrupt context
@@ -70,22 +68,54 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
- irq = desc->parent_irq;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
}
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
+ /* Add to resend_list and activate the softirq: */
+ raw_spin_lock(&irq_resend_lock);
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ raw_spin_unlock(&irq_resend_lock);
tasklet_schedule(&resend_tasklet);
return 0;
}
+void clear_irq_resend(struct irq_desc *desc)
+{
+ raw_spin_lock(&irq_resend_lock);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
+}
+
+void irq_resend_init(struct irq_desc *desc)
+{
+ INIT_HLIST_NODE(&desc->resend_node);
+}
#else
+void clear_irq_resend(struct irq_desc *desc) {}
+void irq_resend_init(struct irq_desc *desc) {}
+
static int irq_sw_resend(struct irq_desc *desc)
{
return -EINVAL;
}
#endif
+static int try_retrigger(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_retrigger)
+ return desc->irq_data.chip->irq_retrigger(&desc->irq_data);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irq_chip_retrigger_hierarchy(&desc->irq_data);
+#else
+ return 0;
+#endif
+}
+
/*
* IRQ resend
*
@@ -113,11 +143,10 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
desc->istate &= ~IRQS_PENDING;
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data))
+ if (!try_retrigger(desc))
err = irq_sw_resend(desc);
- /* If the retrigger was successfull, mark it with the REPLAY bit */
+ /* If the retrigger was successful, mark it with the REPLAY bit */
if (!err)
desc->istate |= IRQS_REPLAY;
return err;
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index e43795cd2ccf..7b7efb1a114b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -17,6 +17,8 @@ enum {
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
+ _IRQ_HIDDEN = IRQ_HIDDEN,
+ _IRQ_NO_DEBUG = IRQ_NO_DEBUG,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -31,6 +33,8 @@ enum {
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
+#define IRQ_HIDDEN GOT_YOU_MORON
+#define IRQ_NO_DEBUG GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -167,3 +171,18 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
}
+
+static inline bool irq_settings_is_hidden(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_HIDDEN;
+}
+
+static inline void irq_settings_set_no_debug(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_DEBUG;
+}
+
+static inline bool irq_settings_no_debug(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_DEBUG;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f865e5f4d382..02b2daf07441 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -403,6 +403,10 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
desc->irqs_unhandled -= ok;
}
+ if (likely(!desc->irqs_unhandled))
+ return;
+
+ /* Now getting into unhandled irq detection */
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
@@ -443,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -455,6 +463,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index e960d7ce7bcc..c43e2ac2f8de 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -84,7 +84,7 @@ void irq_timings_disable(void)
* 2. Log interval
*
* We saw the irq timings allow to compute the interval of the
- * occurrences for a specific interrupt. We can reasonibly assume the
+ * occurrences for a specific interrupt. We can reasonably assume the
* longer is the interval, the higher is the error for the next event
* and we can consider storing those interval values into an array
* where each slot in the array correspond to an interval at the power
@@ -416,7 +416,7 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
* Copy the content of the circular buffer into another buffer
* in order to linearize the buffer instead of dealing with
* wrapping indexes and shifted array which will be prone to
- * error and extremelly difficult to debug.
+ * error and extremely difficult to debug.
*/
for (i = 0; i < count; i++) {
int index = (start + i) & IRQ_TIMINGS_MASK;
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
@@ -485,7 +490,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
/*
* The interrupt triggered more than one second apart, that
- * ends the sequence as predictible for our purpose. In this
+ * ends the sequence as predictable for our purpose. In this
* case, assume we have the beginning of a sequence and the
* timestamp is the first value. As it is impossible to
* predict anything at this point, return.
@@ -514,7 +519,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
* If more than the array size interrupts happened during the
* last busy/idle cycle, the index wrapped up and we have to
* begin with the next element in the array which is the last one
- * in the sequence, otherwise it is a the index 0.
+ * in the sequence, otherwise it is at the index 0.
*
* - have an indication of the interrupts activity on this CPU
* (eg. irq/sec)
@@ -604,7 +609,7 @@ int irq_timings_alloc(int irq)
/*
* Some platforms can have the same private interrupt per cpu,
- * so this function may be be called several times with the
+ * so this function may be called several times with the
* same interrupt number. Just bail out in case the per cpu
* stat structure is already allocated.
*/
@@ -794,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti)
__irq_timings_store(irq, irqs, ti->intervals[i]);
if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ ret = -EBADSLT;
pr_err("Failed to store in the circular buffer\n");
goto out;
}
}
if (irqs->count != ti->count) {
+ ret = -ERANGE;
pr_err("Count differs\n");
goto out;
}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index eca83965b631..2f4fb336dda1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,11 +18,38 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <asm/processor.h>
+#include <linux/kasan.h>
+#include <trace/events/ipi.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
+static DEFINE_PER_CPU(struct task_struct *, irq_workd);
+
+static void wake_irq_workd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(irq_workd);
+
+ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
+ wake_up_process(tsk);
+}
+
+#ifdef CONFIG_SMP
+static void irq_work_wake(struct irq_work *entry)
+{
+ wake_irq_workd();
+}
+
+static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
+ IRQ_WORK_INIT_HARD(irq_work_wake);
+#endif
+
+static int irq_workd_should_run(unsigned int cpu)
+{
+ return !llist_empty(this_cpu_ptr(&lazy_list));
+}
/*
* Claim the entry so that no one else will poke at it.
@@ -31,10 +58,10 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
- oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
/*
* If the work is already pending, no need to raise the IPI.
- * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
+ * The pairing smp_mb() in irq_work_single() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
@@ -49,18 +76,40 @@ void __weak arch_irq_work_raise(void)
*/
}
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
+ struct llist_head *list;
+ bool rt_lazy_work = false;
+ bool lazy_work = false;
+ int work_flags;
+
+ work_flags = atomic_read(&work->node.a_flags);
+ if (work_flags & IRQ_WORK_LAZY)
+ lazy_work = true;
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(work_flags & IRQ_WORK_HARD_IRQ))
+ rt_lazy_work = true;
+
+ if (lazy_work || rt_lazy_work)
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (!llist_add(&work->node.llist, list))
+ return;
+
/* If the work is "lazy", handle it from next tick if any */
- if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
- } else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
- }
+ if (!lazy_work || tick_nohz_tick_stopped())
+ irq_work_raise(work);
}
/* Enqueue the irq work @work on the current CPU */
@@ -98,21 +147,40 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
+ kasan_record_aux_stack_noalloc(work);
+
preempt_disable();
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- __smp_call_single_queue(cpu, &work->llnode);
+
+ /*
+ * On PREEMPT_RT the items which are not marked as
+ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
+ * item is used on the remote CPU to wake the thread.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
+
+ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
+ goto out;
+
+ work = &per_cpu(irq_work_wakeup, cpu);
+ if (!irq_work_claim(work))
+ goto out;
+ }
+
+ __smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
+out:
preempt_enable();
return true;
#endif /* CONFIG_SMP */
}
-
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
@@ -136,23 +204,32 @@ void irq_work_single(void *arg)
int flags;
/*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
+ * Clear the PENDING bit, after this point the @work can be re-used.
+ * The PENDING bit acts as a lock, and we own it, so we can clear it
+ * without atomic ops.
*/
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+ flags = atomic_read(&work->node.a_flags);
+ flags &= ~IRQ_WORK_PENDING;
+ atomic_set(&work->node.a_flags, flags);
- lockdep_irq_work_enter(work);
+ /*
+ * See irq_work_claim().
+ */
+ smp_mb();
+
+ lockdep_irq_work_enter(flags);
work->func(work);
- lockdep_irq_work_exit(work);
+ lockdep_irq_work_exit(flags);
+
/*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
+ * Clear the BUSY bit, if set, and return to the free state if no-one
+ * else claimed it meanwhile.
*/
- flags &= ~IRQ_WORK_PENDING;
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt())
+ rcuwait_wake_up(&work->irqwait);
}
static void irq_work_run_list(struct llist_head *list)
@@ -160,13 +237,18 @@ static void irq_work_run_list(struct llist_head *list)
struct irq_work *work, *tmp;
struct llist_node *llnode;
- BUG_ON(!irqs_disabled());
+ /*
+ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
+ * in a per-CPU thread in preemptible context. Only the items which are
+ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
+ */
+ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
if (llist_empty(list))
return;
llnode = llist_del_all(list);
- llist_for_each_entry_safe(work, tmp, llnode, llnode)
+ llist_for_each_entry_safe(work, tmp, llnode, node.llist)
irq_work_single(work);
}
@@ -177,7 +259,10 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -187,7 +272,11 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
/*
@@ -197,8 +286,42 @@ void irq_work_tick(void)
void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
+ might_sleep();
- while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt()) {
+ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
+ TASK_UNINTERRUPTIBLE);
+ return;
+ }
+
+ while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+static void run_irq_workd(unsigned int cpu)
+{
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+static void irq_workd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static struct smp_hotplug_thread irqwork_threads = {
+ .store = &irq_workd,
+ .setup = irq_workd_setup,
+ .thread_should_run = irq_workd_should_run,
+ .thread_fn = run_irq_workd,
+ .thread_comm = "irq_work/%u",
+};
+
+static __init int irq_work_init_threads(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
+ return 0;
+}
+early_initcall(irq_work_init_threads);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index cdb3ffab128b..d9c822bbffb8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -19,7 +19,7 @@
#include <linux/cpu.h>
#include <asm/sections.h>
-/* mutex to protect coming/going of the the jump_label table */
+/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
void jump_label_lock(void)
@@ -113,11 +113,40 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || (v + 1) < 0)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
@@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
@@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
+ jump_label_unlock();
+ return false;
+ }
}
jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -309,38 +337,36 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
if (jump_entry_code(entry) <= (unsigned long)end &&
- jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+ jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start)
return 1;
return 0;
}
static int __jump_label_text_reserved(struct jump_entry *iter_start,
- struct jump_entry *iter_stop, void *start, void *end)
+ struct jump_entry *iter_stop, void *start, void *end, bool init)
{
struct jump_entry *iter;
iter = iter_start;
while (iter < iter_stop) {
- if (addr_conflict(iter, start, end))
- return 1;
+ if (init || !jump_entry_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
iter++;
}
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ /* nothing to do on most architectures */
}
+#endif
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
@@ -407,6 +433,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
WARN_ONCE(!jump_entry_is_init(entry),
"can't patch jump_label at %pS",
(void *)jump_entry_code(entry));
@@ -475,13 +509,14 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
/* rewrite NOPs */
if (jump_label_type(iter) == JUMP_LABEL_NOP)
arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- if (init_section_contains((void *)jump_entry_code(iter), 1))
- jump_entry_set_init(iter);
+ in_init = init_section_contains((void *)jump_entry_code(iter), 1);
+ jump_entry_set_init(iter, in_init);
iterk = jump_entry_key(iter);
if (iterk == key)
@@ -497,7 +532,7 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
bool type = static_key_type(key);
@@ -539,19 +574,25 @@ static void static_key_set_mod(struct static_key *key,
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ int ret;
preempt_disable();
mod = __module_text_address((unsigned long)start);
WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
preempt_enable();
if (!mod)
return 0;
-
- return __jump_label_text_reserved(mod->jump_entries,
+ ret = __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
- start, end);
+ start, end, mod->state == MODULE_STATE_COMING);
+
+ module_put(mod);
+
+ return ret;
}
static void __jump_label_mod_update(struct static_key *key)
@@ -579,31 +620,6 @@ static void __jump_label_mod_update(struct static_key *key)
}
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
-
- for (iter = iter_start; iter < iter_stop; iter++) {
- /* Only write NOPs for arch_branch_static(). */
- if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
- arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- }
-}
-
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
@@ -620,9 +636,10 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
- if (within_module_init(jump_entry_code(iter), mod))
- jump_entry_set_init(iter);
+ in_init = within_module_init(jump_entry_code(iter), mod);
+ jump_entry_set_init(iter, in_init);
iterk = jump_entry_key(iter);
if (iterk == key)
@@ -772,8 +789,9 @@ early_initcall(jump_label_init_module);
*/
int jump_label_text_reserved(void *start, void *end)
{
+ bool init = system_state < SYSTEM_RUNNING;
int ret = __jump_label_text_reserved(__start___jump_table,
- __stop___jump_table, start, end);
+ __stop___jump_table, start, end, init);
if (ret)
return ret;
@@ -787,6 +805,7 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
+ bool init = system_state < SYSTEM_RUNNING;
struct jump_entry *entry;
#ifdef CONFIG_MODULES
struct module *mod;
@@ -798,15 +817,16 @@ static void jump_label_update(struct static_key *key)
preempt_disable();
mod = __module_address((unsigned long)key);
- if (mod)
+ if (mod) {
stop = mod->jump_entries + mod->num_jump_entries;
+ init = mod->state == MODULE_STATE_COMING;
+ }
preempt_enable();
#endif
entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop,
- system_state < SYSTEM_RUNNING);
+ __jump_label_update(key, entry, stop, init);
}
#ifdef CONFIG_STATIC_KEYS_SELFTEST
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index bb14e64f62a4..18edd57b5fe8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,30 +24,15 @@
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
+#include <linux/build_bug.h>
#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
-/*
- * These will be re-linked against their real values
- * during the second link stage.
- */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const int kallsyms_offsets[] __weak;
-extern const u8 kallsyms_names[] __weak;
-
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned int kallsyms_num_syms
-__attribute__((weak, section(".rodata")));
-
-extern const unsigned long kallsyms_relative_base
-__attribute__((weak, section(".rodata")));
-
-extern const char kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
-
-extern const unsigned int kallsyms_markers[] __weak;
+#include "kallsyms_internal.h"
/*
* Expand a compressed symbol data into the resulting uncompressed string,
@@ -65,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -123,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -137,13 +130,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
return kallsyms_addresses[idx];
@@ -160,24 +163,126 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-/* Lookup the address for this symbol. Returns 0 if not found. */
-unsigned long kallsyms_lookup_name(const char *name)
+static void cleanup_symbol_name(char *s)
+{
+ char *res;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return;
+
+ /*
+ * LLVM appends various suffixes for local functions and variables that
+ * must be promoted to global scope as part of LTO. This can break
+ * hooking of static functions with kprobes. '.' is not a valid
+ * character in an identifier in C. Suffixes only in LLVM LTO observed:
+ * - foo.llvm.[0-9a-f]+
+ */
+ res = strstr(s, ".llvm.");
+ if (res)
+ *res = '\0';
+
+ return;
+}
+
+static int compare_symbol_name(const char *name, char *namebuf)
{
+ /* The kallsyms_seqs_of_names is sorted based on names after
+ * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled.
+ * To ensure correct bisection in kallsyms_lookup_names(), do
+ * cleanup_symbol_name(namebuf) before comparing name and namebuf.
+ */
+ cleanup_symbol_name(namebuf);
+ return strcmp(name, namebuf);
+}
+
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
+{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ low = 0;
+ high = kallsyms_num_syms - 1;
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = compare_symbol_name(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
}
+
+ if (low > high)
+ return -ESRCH;
+
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
+ }
+
+ return 0;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ int ret;
+ unsigned int i;
+
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
+
return module_kallsyms_lookup_name(name);
}
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
- unsigned long),
+/*
+ * Iterate over all symbols in vmlinux. For symbols from modules use
+ * module_kallsyms_on_each_symbol instead.
+ */
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
@@ -187,11 +292,30 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
+ ret = fn(data, namebuf, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
- return module_kallsyms_on_each_symbol(fn, data);
+ return 0;
+}
+
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
}
static unsigned long get_symbol_pos(unsigned long addr,
@@ -266,21 +390,14 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
get_symbol_pos(addr, symbolsize, offset);
return 1;
}
- return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
-/*
- * Lookup an address
- * - modname is set to NULL if it's in the kernel.
- * - We guarantee that the returned name is valid until we reschedule even if.
- * It resides in a module.
- * - We also guarantee that modname will be valid until rescheduled.
- */
-const char *kallsyms_lookup(unsigned long addr,
- unsigned long *symbolsize,
- unsigned long *offset,
- char **modname, char *namebuf)
+static const char *kallsyms_lookup_buildid(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset, char **modname,
+ const unsigned char **modbuildid, char *namebuf)
{
const char *ret;
@@ -296,12 +413,16 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
- return namebuf;
+ if (modbuildid)
+ *modbuildid = NULL;
+
+ ret = namebuf;
+ goto found;
}
/* See if it's in a module or a BPF JITed image. */
ret = module_address_lookup(addr, symbolsize, offset,
- modname, namebuf);
+ modname, modbuildid, namebuf);
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
@@ -309,11 +430,32 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = ftrace_mod_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+found:
+ cleanup_symbol_name(namebuf);
return ret;
}
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ return kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
+ NULL, namebuf);
+}
+
int lookup_symbol_name(unsigned long addr, char *symname)
{
+ int res;
+
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -324,43 +466,31 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_name(addr, symname);
-}
-
-int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- name[0] = '\0';
- name[KSYM_NAME_LEN - 1] = '\0';
-
- if (is_ksym_addr(addr)) {
- unsigned long pos;
+ res = lookup_module_symbol_name(addr, symname);
+ if (res)
+ return res;
- pos = get_symbol_pos(addr, size, offset);
- /* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos),
- name, KSYM_NAME_LEN);
- modname[0] = '\0';
- return 0;
- }
- /* See if it's in a module. */
- return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+found:
+ cleanup_symbol_name(symname);
+ return 0;
}
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
- int symbol_offset, int add_offset)
+ int symbol_offset, int add_offset, int add_buildid)
{
char *modname;
+ const unsigned char *buildid;
const char *name;
unsigned long offset, size;
int len;
address += symbol_offset;
- name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
+ name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
+ buffer);
if (!name)
return sprintf(buffer, "0x%lx", address - symbol_offset);
@@ -372,8 +502,19 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (add_offset)
len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
- if (modname)
- len += sprintf(buffer + len, " [%s]", modname);
+ if (modname) {
+ len += sprintf(buffer + len, " [%s", modname);
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ if (add_buildid && buildid) {
+ /* build ID should match length of sprintf */
+#if IS_ENABLED(CONFIG_MODULES)
+ static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+ len += sprintf(buffer + len, " %20phN", buildid);
+ }
+#endif
+ len += sprintf(buffer + len, "]");
+ }
return len;
}
@@ -391,11 +532,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
*/
int sprint_symbol(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 1);
+ return __sprint_symbol(buffer, address, 0, 1, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
+ * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size, module name and module build ID to @buffer if possible. If no
+ * symbol was found, just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_build_id(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 1, 1);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_build_id);
+
+/**
* sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
@@ -408,7 +566,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
*/
int sprint_symbol_no_offset(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 0);
+ return __sprint_symbol(buffer, address, 0, 0, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
@@ -428,15 +586,35 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, -1, 1);
+ return __sprint_symbol(buffer, address, -1, 1, 0);
+}
+
+/**
+ * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address. This function also appends the module build ID to
+ * the @buffer if @address is within a kernel module.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace_build_id(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, -1, 1, 1);
}
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
- loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
+ loff_t pos_bpf_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -446,29 +624,9 @@ struct kallsym_iter {
int show_value;
};
-int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name)
-{
- return -EINVAL;
-}
-
-static int get_ksymbol_arch(struct kallsym_iter *iter)
-{
- int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name);
-
- if (ret < 0) {
- iter->pos_arch_end = iter->pos;
- return 0;
- }
-
- return 1;
-}
-
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- int ret = module_get_kallsym(iter->pos - iter->pos_arch_end,
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
&iter->value, &iter->type,
iter->name, iter->module_name,
&iter->exported);
@@ -480,6 +638,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
{
int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
@@ -496,11 +659,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
- strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
+ int ret;
+
+ strscpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
- &iter->value, &iter->type,
- iter->name) < 0 ? 0 : 1;
+ ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+ &iter->value, &iter->type,
+ iter->name);
+ if (ret < 0) {
+ iter->pos_bpf_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+ strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ iter->exported = 0;
+ return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
}
/* Returns space to next name. */
@@ -524,9 +709,9 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
if (new_pos == 0) {
- iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
+ iter->pos_bpf_end = 0;
}
}
@@ -539,10 +724,6 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if ((!iter->pos_arch_end || iter->pos_arch_end > pos) &&
- get_ksymbol_arch(iter))
- return 1;
-
if ((!iter->pos_mod_end || iter->pos_mod_end > pos) &&
get_ksymbol_mod(iter))
return 1;
@@ -551,7 +732,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
get_ksymbol_ftrace_mod(iter))
return 1;
- return get_ksymbol_bpf(iter);
+ if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+ get_ksymbol_bpf(iter))
+ return 1;
+
+ return get_ksymbol_kprobe(iter);
}
/* Returns false if pos at or past end of file. */
@@ -626,41 +811,96 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
-static inline int kallsyms_for_perf(void)
+#ifdef CONFIG_BPF_SYSCALL
+
+struct bpf_iter__ksym {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kallsym_iter *, ksym);
+};
+
+static int ksym_prog_seq_show(struct seq_file *m, bool in_stop)
{
-#ifdef CONFIG_PERF_EVENTS
- extern int sysctl_perf_event_paranoid;
- if (sysctl_perf_event_paranoid <= 1)
- return 1;
-#endif
+ struct bpf_iter__ksym ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = m;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ksym = m ? m->private : NULL;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p)
+{
+ return ksym_prog_seq_show(m, false);
+}
+
+static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p)
+{
+ if (!p)
+ (void) ksym_prog_seq_show(m, true);
+ else
+ s_stop(m, p);
+}
+
+static const struct seq_operations bpf_iter_ksym_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = bpf_iter_ksym_seq_stop,
+ .show = bpf_iter_ksym_seq_show,
+};
+
+static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct kallsym_iter *iter = priv_data;
+
+ reset_iter(iter, 0);
+
+ /* cache here as in kallsyms_open() case; use current process
+ * credentials to tell BPF iterators if values should be shown.
+ */
+ iter->show_value = kallsyms_show_value(current_cred());
+
return 0;
}
-/*
- * We show kallsyms information even to normal users if we've enabled
- * kernel profiling and are explicitly not paranoid (so kptr_restrict
- * is clear, and sysctl_perf_event_paranoid isn't set).
- *
- * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
- * block even that).
- */
-bool kallsyms_show_value(const struct cred *cred)
-{
- switch (kptr_restrict) {
- case 0:
- if (kallsyms_for_perf())
- return true;
- /* fallthrough */
- case 1:
- if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
- CAP_OPT_NOAUDIT) == 0)
- return true;
- /* fallthrough */
- default:
- return false;
- }
+DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym)
+
+static const struct bpf_iter_seq_info ksym_iter_seq_info = {
+ .seq_ops = &bpf_iter_ksym_ops,
+ .init_seq_private = bpf_iter_ksym_init,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct kallsym_iter),
+};
+
+static struct bpf_iter_reg ksym_iter_reg_info = {
+ .target = "ksym",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ksym, ksym),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ksym_iter_seq_info,
+};
+
+BTF_ID_LIST(btf_ksym_iter_id)
+BTF_ID(struct, kallsym_iter)
+
+static int __init bpf_ksym_iter_register(void)
+{
+ ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id;
+ return bpf_iter_reg_target(&ksym_iter_reg_info);
}
+late_initcall(bpf_ksym_iter_register);
+
+#endif /* CONFIG_BPF_SYSCALL */
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
new file mode 100644
index 000000000000..27fabdcc40f5
--- /dev/null
+++ b/kernel/kallsyms_internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LINUX_KALLSYMS_INTERNAL_H_
+#define LINUX_KALLSYMS_INTERNAL_H_
+
+#include <linux/types.h>
+
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
+extern const u8 kallsyms_names[] __weak;
+
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
+ */
+extern const unsigned int kallsyms_num_syms
+__section(".rodata") __attribute__((weak));
+
+extern const unsigned long kallsyms_relative_base
+__section(".rodata") __attribute__((weak));
+
+extern const char kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
+
+extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
+
+#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..b4cac76ea5e9
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+ ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, unsigned long addr)
+{
+ u64 t0, t1, t;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ t0 = ktime_get_ns();
+ (void)kallsyms_lookup_name(name);
+ t1 = ktime_get_ns();
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static bool match_cleanup_name(const char *s, const char *name)
+{
+ char *p;
+ int len;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ p = strstr(s, ".llvm.");
+ if (!p)
+ return false;
+
+ len = strlen(name);
+ if (p - s != len)
+ return false;
+
+ return !strncmp(s, name, len);
+}
+
+static int find_symbol(void *data, const char *name, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (strcmp(name, stat->name) == 0 ||
+ (!stat->perf && match_cleanup_name(name, stat->name))) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr);
+ goto failed;
+ }
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0]))) {
+ pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n",
+ namebuf);
+ goto failed;
+ }
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt) {
+ pr_info("%s: Never found\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr) {
+ pr_info("%s: NULL lookup_addr?!\n", namebuf);
+ goto failed;
+ }
+ if (lookup_addr != stat->addrs[0]) {
+ pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt) {
+ pr_info("%s: j == save_cnt?!\n", namebuf);
+ goto failed;
+ }
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_create(test_entry, NULL, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+ kthread_bind(t, 0);
+ wake_up_process(t);
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b3ff9288c6cc..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -61,39 +61,36 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
- struct file *file = NULL;
+ struct file *file;
- task_lock(task);
rcu_read_lock();
-
- if (task->files)
- file = fcheck_files(task->files, idx);
-
+ file = task_lookup_fdget_rcu(task, idx);
rcu_read_unlock();
- task_unlock(task);
+ if (file)
+ fput(file);
return file;
}
-static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
- if (likely(m2 != m1))
- mutex_unlock(m2);
- mutex_unlock(m1);
+ if (likely(l2 != l1))
+ up_read(l2);
+ up_read(l1);
}
-static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
int err;
- if (m2 > m1)
- swap(m1, m2);
+ if (l2 > l1)
+ swap(l1, l2);
- err = mutex_lock_killable(m1);
- if (!err && likely(m1 != m2)) {
- err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ err = down_read_killable(l1);
+ if (!err && likely(l1 != l2)) {
+ err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING);
if (err)
- mutex_unlock(m1);
+ up_read(l1);
}
return err;
@@ -107,7 +104,6 @@ static int kcmp_epoll_target(struct task_struct *task1,
{
struct file *filp, *filp_epoll, *filp_tgt;
struct kcmp_epoll_slot slot;
- struct files_struct *files;
if (copy_from_user(&slot, uslot, sizeof(slot)))
return -EFAULT;
@@ -116,23 +112,12 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (!filp)
return -EBADF;
- files = get_files_struct(task2);
- if (!files)
+ filp_epoll = fget_task(task2, slot.efd);
+ if (!filp_epoll)
return -EBADF;
- spin_lock(&files->file_lock);
- filp_epoll = fcheck_files(files, slot.efd);
- if (filp_epoll)
- get_file(filp_epoll);
- else
- filp_tgt = ERR_PTR(-EBADF);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (filp_epoll) {
- filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
- fput(filp_epoll);
- }
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
@@ -173,8 +158,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
/*
* One should have enough rights to inspect task details.
*/
- ret = kcmp_lock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ ret = kcmp_lock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
if (ret)
goto err;
if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
@@ -229,8 +214,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
}
err_unlock:
- kcmp_unlock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ kcmp_unlock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
err:
put_task_struct(task1);
put_task_struct(task2);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..f9ac2e9e460f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -88,6 +89,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
+ local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@@ -96,7 +98,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -149,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
@@ -201,8 +211,16 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first 64-bit word is the number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
+ /* Previously we write pc before updating pos. However, some
+ * early interrupt code could bypass check_kcov_mode() check
+ * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+ * raised between writing pc and updating pos, the pc could be
+ * overitten by the recursive __sanitizer_cov_trace_pc().
+ * Update pos before writing pc to avoid such interleaving.
+ */
WRITE_ONCE(area[0], pos);
+ barrier();
+ area[pos] = ip;
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
@@ -233,11 +251,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
start_index = 1 + count * KCOV_WORDS_PER_CMP;
end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
if (likely(end_pos <= max_pos)) {
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ WRITE_ONCE(area[0], count + 1);
+ barrier();
area[start_index] = type;
area[start_index + 1] = arg1;
area[start_index + 2] = arg2;
area[start_index + 3] = ip;
- WRITE_ONCE(area[0], count + 1);
}
}
@@ -259,7 +279,7 @@ void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
-void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
}
@@ -286,16 +306,17 @@ void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
-void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2)
{
write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
_RET_IP_);
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
-void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg)
{
u64 i;
+ u64 *cases = arg;
u64 count = cases[0];
u64 size = cases[1];
u64 type = KCOV_CMP_CONST;
@@ -456,37 +477,31 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vm_flags_set(vma, VM_DONTEXPAND);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ res = vm_insert_page(vma, vma->vm_start + off, page);
+ if (res) {
+ pr_warn_once("kcov: vm_insert_page() failed\n");
+ return res;
}
- return 0;
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -561,31 +576,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -689,9 +685,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -707,16 +731,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
@@ -775,7 +801,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
}
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -792,7 +818,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
}
}
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
@@ -824,7 +850,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@@ -832,7 +858,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -841,14 +867,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@@ -869,19 +896,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
- local_irq_save(flags);
-
/* Reset coverage size. */
*(u64 *)area = 0;
@@ -891,7 +918,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -965,12 +992,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -978,12 +1005,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@@ -1013,7 +1040,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@@ -1023,6 +1050,8 @@ EXPORT_SYMBOL(kcov_remote_stop);
/* See the comment before kcov_remote_start() for usage details. */
u64 kcov_common_handle(void)
{
+ if (!in_task())
+ return 0;
return current->kcov_handle;
}
EXPORT_SYMBOL(kcov_common_handle);
@@ -1032,8 +1061,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
- sizeof(unsigned long));
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig
new file mode 100644
index 000000000000..e82f0f52ab0a
--- /dev/null
+++ b/kernel/kcsan/.kunitconfig
@@ -0,0 +1,24 @@
+# Note that the KCSAN tests need to run on an SMP setup.
+# Under kunit_tool, this can be done by using the --qemu_args
+# option to configure a machine with several cores. For example:
+# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \
+# --arch=x86_64 --qemu_args="-smp 8"
+
+CONFIG_KUNIT=y
+
+CONFIG_DEBUG_KERNEL=y
+
+# Need some level of concurrency to test a concurrency sanitizer.
+CONFIG_SMP=y
+
+CONFIG_KCSAN=y
+CONFIG_KCSAN_KUNIT_TEST=y
+
+# Set these if you want to run test_barrier_nothreads
+#CONFIG_KCSAN_STRICT=y
+#CONFIG_KCSAN_WEAK_MEMORY=y
+
+# This prevents the test from timing out on many setups. Feel free to remove
+# (or alter) this, in conjunction with setting a different test timeout with,
+# for example, the --timeout kunit_tool option.
+CONFIG_KCSAN_REPORT_ONCE_IN_MS=100
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index d4999b38d1be..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -7,8 +7,15 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
-CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
- $(call cc-option,-fno-stack-protector,)
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
+ -fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
-obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
+obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
+
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
+obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
deleted file mode 100644
index be9e625227f3..000000000000
--- a/kernel/kcsan/atomic.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _KERNEL_KCSAN_ATOMIC_H
-#define _KERNEL_KCSAN_ATOMIC_H
-
-#include <linux/jiffies.h>
-#include <linux/sched.h>
-
-/*
- * Special rules for certain memory where concurrent conflicting accesses are
- * common, however, the current convention is to not mark them; returns true if
- * access to @ptr should be considered atomic. Called from slow-path.
- */
-static bool kcsan_is_atomic_special(const volatile void *ptr)
-{
- /* volatile globals that have been observed in data races. */
- return ptr == &jiffies || ptr == &current->state;
-}
-
-#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..8a7baf4e332e 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1,4 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN core runtime.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
#include <linux/atomic.h>
#include <linux/bug.h>
@@ -7,16 +14,17 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
-#include <linux/random.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include <linux/uaccess.h>
-#include "atomic.h"
#include "encoding.h"
#include "kcsan.h"
+#include "permissive.h"
static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
@@ -34,15 +42,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
- .disable_count = 0,
- .atomic_next = 0,
- .atomic_nest_count = 0,
- .in_flat_atomic = false,
- .access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@@ -98,6 +108,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
*/
static DEFINE_PER_CPU(long, kcsan_skip);
+/* For kcsan_prandom_u32_max(). */
+static DEFINE_PER_CPU(u32, kcsan_rand_state);
+
static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
size_t size,
bool expect_write,
@@ -193,22 +206,29 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
}
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
+
/* Check scoped accesses; never inline because this is a slow-path! */
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
- struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
- ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
- list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
- __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
- ctx->scoped_accesses.prev = prev_save;
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
+ check_access(scoped_access->ptr, scoped_access->size,
+ scoped_access->type, scoped_access->ip);
+ }
+ ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
static __always_inline bool
-is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
if (type & KCSAN_ACCESS_ATOMIC)
return true;
@@ -223,7 +243,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
(type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
- IS_ALIGNED((unsigned long)ptr, size))
+ !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size))
return true; /* Assume aligned writes up to word size are atomic. */
if (ctx->atomic_next > 0) {
@@ -245,7 +265,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
}
static __always_inline bool
-should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
/*
* Never set up watchpoints when memory operations are atomic.
@@ -254,7 +274,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
* should not count towards skipped instructions, and (2) to actually
* decrement kcsan_atomic_next for consecutive instruction stream.
*/
- if (is_atomic(ptr, size, type, ctx))
+ if (is_atomic(ctx, ptr, size, type))
return false;
if (this_cpu_dec_return(kcsan_skip) >= 0)
@@ -269,26 +289,154 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
return true;
}
+/*
+ * Returns a pseudo-random number in interval [0, ep_ro). Simple linear
+ * congruential generator, using constants from "Numerical Recipes".
+ */
+static u32 kcsan_prandom_u32_max(u32 ep_ro)
+{
+ u32 state = this_cpu_read(kcsan_rand_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(kcsan_rand_state, state);
+
+ return state % ep_ro;
+}
+
static inline void reset_kcsan_skip(void)
{
long skip_count = kcsan_skip_watch -
(IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
- prandom_u32_max(kcsan_skip_watch) :
+ kcsan_prandom_u32_max(kcsan_skip_watch) :
0);
this_cpu_write(kcsan_skip, skip_count);
}
-static __always_inline bool kcsan_is_enabled(void)
+static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx)
{
- return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
+ return READ_ONCE(kcsan_enabled) && !ctx->disable_count;
}
-static inline unsigned int get_delay(void)
+/* Introduce delay depending on context and configuration. */
+static void delay_access(int type)
{
unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
- return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
- prandom_u32_max(delay) :
- 0);
+ /* For certain access types, skew the random delay to be longer. */
+ unsigned int skew_delay_order =
+ (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0;
+
+ delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ kcsan_prandom_u32_max(delay >> skew_delay_order) :
+ 0;
+ udelay(delay);
+}
+
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
+ switch (size) {
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
+void kcsan_save_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->kcsan_save_irqtrace = task->irqtrace;
+#endif
+}
+
+void kcsan_restore_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->irqtrace = task->kcsan_save_irqtrace;
+#endif
+}
+
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
}
/*
@@ -307,28 +455,47 @@ static inline unsigned int get_delay(void)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
size_t size,
int type,
+ unsigned long ip,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ struct kcsan_ctx *ctx = get_ctx();
unsigned long flags;
bool consumed;
- if (!kcsan_is_enabled())
+ /*
+ * We know a watchpoint exists. Let's try to keep the race-window
+ * between here and finally consuming the watchpoint below as small as
+ * possible -- avoid unneccessarily complex code until consumed.
+ */
+
+ if (!kcsan_is_enabled(ctx))
return;
/*
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
+ */
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
+ return;
+
+ /*
+ * If the other thread does not want to ignore the access, and there was
+ * a value change as a result of this thread's operation, we will still
+ * generate a report of unknown origin.
+ *
+ * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter.
*/
- if (get_ctx()->access_mask != 0)
+ if (!is_assert && kcsan_ignore_address(ptr))
return;
/*
- * Consume the watchpoint as soon as possible, to minimize the chances
- * of !consumed. Consuming the watchpoint must always be guarded by
- * kcsan_is_enabled() check, as otherwise we might erroneously
- * triggering reports when disabled.
+ * Consuming the watchpoint must be guarded by kcsan_is_enabled() to
+ * avoid erroneously triggering reports if the context is disabled.
*/
consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
@@ -336,42 +503,40 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
flags = user_access_save();
if (consumed) {
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
- watchpoint - watchpoints);
+ kcsan_save_irqtrace(current);
+ kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
+ kcsan_restore_irqtrace(current);
} else {
/*
* The other thread may not print any diagnostics, as it has
* already removed the watchpoint, or another thread consumed
* the watchpoint before this thread.
*/
- kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
}
- if ((type & KCSAN_ACCESS_ASSERT) != 0)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ if (is_assert)
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
else
- kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
user_access_restore(flags);
}
static noinline void
-kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
- union {
- u8 _1;
- u16 _2;
- u32 _4;
- u64 _8;
- } expect_value;
- unsigned long access_mask;
+ u64 old, new, diff;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
+ struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
+ bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@@ -379,26 +544,49 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
reset_kcsan_skip();
- if (!kcsan_is_enabled())
+ if (!kcsan_is_enabled(ctx))
goto out;
/*
- * Special atomic rules: unlikely to be true, so we check them here in
- * the slow-path, and not in the fast-path in is_atomic(). Call after
- * kcsan_is_enabled(), as we may access memory that is not yet
- * initialized during early boot.
+ * Check to-ignore addresses after kcsan_is_enabled(), as we may access
+ * memory that is not yet initialized during early boot.
*/
- if (!is_assert && kcsan_is_atomic_special(ptr))
+ if (!is_assert && kcsan_ignore_address(ptr))
goto out;
if (!check_encodable((unsigned long)ptr, size)) {
- kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]);
goto out;
}
- if (!kcsan_interrupt_watcher)
- /* Use raw to avoid lockdep recursion via IRQ flags tracing. */
- raw_local_irq_save(irq_flags);
+ /*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
+ * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
+ * runtime is entered for every memory access, and potentially useful
+ * information is lost if dirtied by KCSAN.
+ */
+ kcsan_save_irqtrace(current);
+ if (!interrupt_watcher)
+ local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
if (watchpoint == NULL) {
@@ -407,82 +595,53 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* with which should_watch() returns true should be tweaked so
* that this case happens very rarely.
*/
- kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]);
goto out_unlock;
}
- kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS);
- kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
/*
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- expect_value._8 = 0;
- switch (size) {
- case 1:
- expect_value._1 = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- expect_value._2 = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- expect_value._4 = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- expect_value._8 = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
- }
-
- if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
- kcsan_disable_current();
- pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
- is_write ? "write" : "read", size, ptr,
- watchpoint_slot((unsigned long)ptr),
- encode_watchpoint((unsigned long)ptr, size, is_write));
- kcsan_enable_current();
- }
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
* conflicting access.
*/
- udelay(get_delay());
+ delay_access(type);
/*
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = get_ctx()->access_mask;
- switch (size) {
- case 1:
- expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
- if (access_mask)
- expect_value._1 &= (u8)access_mask;
- break;
- case 2:
- expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
- if (access_mask)
- expect_value._2 &= (u16)access_mask;
- break;
- case 4:
- expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
- if (access_mask)
- expect_value._4 &= (u32)access_mask;
- break;
- case 8:
- expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
- if (access_mask)
- expect_value._8 &= (u64)access_mask;
- break;
- default:
- break; /* ignore; we do not diff the values */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
}
- /* Were we able to observe a value-change? */
- if (expect_value._8 != 0)
+ diff = old ^ new;
+ if (access_mask)
+ diff &= access_mask;
+
+ /*
+ * Check if we observed a value change.
+ *
+ * Also check if the data race should be ignored (the rules depend on
+ * non-zero diff); if it is to be ignored, the below rules for
+ * KCSAN_VALUE_CHANGE_MAYBE apply.
+ */
+ if (diff && !kcsan_ignore_data_race(size, type, old, new, diff))
value_change = KCSAN_VALUE_CHANGE_TRUE;
/* Check if this access raced with another. */
@@ -514,21 +673,22 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* increment this counter.
*/
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
- watchpoint - watchpoints);
+ kcsan_report_known_origin(ptr, size, type, ip,
+ value_change, watchpoint - watchpoints,
+ old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
- kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]);
if (is_assert)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
- watchpoint - watchpoints);
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
+ kcsan_report_unknown_origin(ptr, size, type, ip,
+ old, new, access_mask);
+ }
}
/*
@@ -536,18 +696,28 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* reused after this point.
*/
remove_watchpoint(watchpoint);
- kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
out_unlock:
- if (!kcsan_interrupt_watcher)
- raw_local_irq_restore(irq_flags);
+ if (!interrupt_watcher)
+ local_irq_restore(irq_flags);
+ kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
-static __always_inline void check_access(const volatile void *ptr, size_t size,
- int type)
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
- const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@@ -558,12 +728,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
if (unlikely(size == 0))
return;
+again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
- watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@@ -573,14 +745,46 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
*/
if (unlikely(watchpoint != NULL))
- kcsan_found_watchpoint(ptr, size, type, watchpoint,
- encoded_watchpoint);
+ kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ptr, size, type, ctx)))
- kcsan_setup_watchpoint(ptr, size, type);
- else if (unlikely(ctx->scoped_accesses.prev))
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
+ kcsan_setup_watchpoint(ptr, size, type, ip);
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@@ -589,16 +793,30 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
void __init kcsan_init(void)
{
+ int cpu;
+
BUG_ON(!in_task());
- kcsan_debugfs_init();
+ for_each_possible_cpu(cpu)
+ per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles();
/*
* We are in the init task, and no other tasks should be running;
* WRITE_ONCE without memory barrier is sufficient.
*/
- if (kcsan_early_enable)
+ if (kcsan_early_enable) {
+ pr_info("enabled early\n");
WRITE_ONCE(kcsan_enabled, true);
+ }
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) ||
+ IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) ||
+ IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) ||
+ IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n");
+ } else {
+ pr_info("strict mode configured\n");
+ }
}
/* === Exported interface =================================================== */
@@ -691,7 +909,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
struct kcsan_ctx *ctx = get_ctx();
- __kcsan_check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
@@ -699,6 +917,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
sa->ptr = ptr;
sa->size = size;
sa->type = type;
+ sa->ip = _RET_IP_;
if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
INIT_LIST_HEAD(&ctx->scoped_accesses);
@@ -730,16 +949,32 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
ctx->disable_count--;
- __kcsan_check_access(sa->ptr, sa->size, sa->type);
+ check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);
void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
- check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@@ -754,22 +989,35 @@ EXPORT_SYMBOL(__kcsan_check_access);
*/
#define DEFINE_TSAN_READ_WRITE(size) \
+ void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
- check_access(ptr, size, 0); \
+ check_access(ptr, size, 0, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
__alias(__tsan_read##size); \
EXPORT_SYMBOL(__tsan_unaligned_read##size); \
+ void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
- check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
__alias(__tsan_write##size); \
- EXPORT_SYMBOL(__tsan_unaligned_write##size)
+ EXPORT_SYMBOL(__tsan_unaligned_write##size); \
+ void __tsan_read_write##size(void *ptr); \
+ void __tsan_read_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
+ _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_read_write##size); \
+ void __tsan_unaligned_read_write##size(void *ptr) \
+ __alias(__tsan_read_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read_write##size)
DEFINE_TSAN_READ_WRITE(1);
DEFINE_TSAN_READ_WRITE(2);
@@ -777,15 +1025,17 @@ DEFINE_TSAN_READ_WRITE(4);
DEFINE_TSAN_READ_WRITE(8);
DEFINE_TSAN_READ_WRITE(16);
+void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
- check_access(ptr, size, 0);
+ check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);
+void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
- check_access(ptr, size, KCSAN_ACCESS_WRITE);
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);
@@ -799,18 +1049,21 @@ EXPORT_SYMBOL(__tsan_write_range);
* the size-check of compiletime_assert_rwonce_type().
*/
#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \
+ void __tsan_volatile_read##size(void *ptr); \
void __tsan_volatile_read##size(void *ptr) \
{ \
const bool is_atomic = size <= sizeof(long long) && \
IS_ALIGNED((unsigned long)ptr, size); \
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
return; \
- check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
__alias(__tsan_volatile_read##size); \
EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
+ void __tsan_volatile_write##size(void *ptr); \
void __tsan_volatile_write##size(void *ptr) \
{ \
const bool is_atomic = size <= sizeof(long long) && \
@@ -819,7 +1072,8 @@ EXPORT_SYMBOL(__tsan_write_range);
return; \
check_access(ptr, size, \
KCSAN_ACCESS_WRITE | \
- (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
@@ -833,18 +1087,285 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
*/
-void __tsan_func_entry(void *call_pc)
+void __tsan_func_entry(void *call_pc);
+noinline void __tsan_func_entry(void *call_pc)
{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
-void __tsan_func_exit(void)
+
+void __tsan_func_exit(void);
+noinline void __tsan_func_exit(void)
{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
+
+void __tsan_init(void);
void __tsan_init(void)
{
}
EXPORT_SYMBOL(__tsan_init);
+
+/*
+ * Instrumentation for atomic builtins (__atomic_*, __sync_*).
+ *
+ * Normal kernel code _should not_ be using them directly, but some
+ * architectures may implement some or all atomics using the compilers'
+ * builtins.
+ *
+ * Note: If an architecture decides to fully implement atomics using the
+ * builtins, because they are implicitly instrumented by KCSAN (and KASAN,
+ * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via
+ * atomic-instrumented) is no longer necessary.
+ *
+ * TSAN instrumentation replaces atomic accesses with calls to any of the below
+ * functions, whose job is to also execute the operation itself.
+ */
+
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
+#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_load_n(ptr, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ __atomic_store_n(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_store)
+
+#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_##op##suffix(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_##op)
+
+/*
+ * Note: CAS operations are always classified as write, even in case they
+ * fail. We cannot perform check_access() after a write, as it might lead to
+ * false positives, in cases such as:
+ *
+ * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...)
+ *
+ * T1: if (__atomic_load_n(&p->flag, ...)) {
+ * modify *p;
+ * p->flag = 0;
+ * }
+ *
+ * The only downside is that, if there are 3 threads, with one CAS that
+ * succeeds, another CAS that fails, and an unmarked racing operation, we may
+ * point at the wrong CAS as the source of the race. However, if we assume that
+ * all CAS can succeed in some other execution, the data race is still valid.
+ */
+#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo); \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo) \
+ { \
+ kcsan_atomic_builtin_memorder(mo); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength)
+
+#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo); \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo) \
+ { \
+ kcsan_atomic_builtin_memorder(mo); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
+ return exp; \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val)
+
+#define DEFINE_TSAN_ATOMIC_OPS(bits) \
+ DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \
+ DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits)
+
+DEFINE_TSAN_ATOMIC_OPS(8);
+DEFINE_TSAN_ATOMIC_OPS(16);
+DEFINE_TSAN_ATOMIC_OPS(32);
+#ifdef CONFIG_64BIT
+DEFINE_TSAN_ATOMIC_OPS(64);
+#endif
+
+void __tsan_atomic_thread_fence(int memorder);
+void __tsan_atomic_thread_fence(int memorder)
+{
+ kcsan_atomic_builtin_memorder(memorder);
+ __atomic_thread_fence(memorder);
+}
+EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
+void __tsan_atomic_signal_fence(int memorder);
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 023e49c58d55..1d1d1b0e4248 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -1,4 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN debugfs interface.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
#include <linux/atomic.h>
#include <linux/bsearch.h>
@@ -15,10 +22,19 @@
#include "kcsan.h"
-/*
- * Statistics counters.
- */
-static atomic_long_t counters[KCSAN_COUNTER_COUNT];
+atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints",
+ [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints",
+ [KCSAN_COUNTER_DATA_RACES] = "data_races",
+ [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures",
+ [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity",
+ [KCSAN_COUNTER_REPORT_RACES] = "report_races",
+ [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin",
+ [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses",
+ [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives",
+};
+static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT);
/*
* Addresses for filtering functions from reporting. This list can be used as a
@@ -39,34 +55,6 @@ static struct {
};
static DEFINE_SPINLOCK(report_filterlist_lock);
-static const char *counter_to_name(enum kcsan_counter_id id)
-{
- switch (id) {
- case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints";
- case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints";
- case KCSAN_COUNTER_DATA_RACES: return "data_races";
- case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures";
- case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity";
- case KCSAN_COUNTER_REPORT_RACES: return "report_races";
- case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin";
- case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses";
- case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives";
- case KCSAN_COUNTER_COUNT:
- BUG();
- }
- return NULL;
-}
-
-void kcsan_counter_inc(enum kcsan_counter_id id)
-{
- atomic_long_inc(&counters[id]);
-}
-
-void kcsan_counter_dec(enum kcsan_counter_id id)
-{
- atomic_long_dec(&counters[id]);
-}
-
/*
* The microbenchmark allows benchmarking KCSAN core runtime only. To run
* multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
@@ -76,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters)
{
const struct kcsan_ctx ctx_save = current->kcsan_ctx;
const bool was_enabled = READ_ONCE(kcsan_enabled);
- cycles_t cycles;
+ u64 cycles;
/* We may have been called from an atomic region; reset context. */
memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
@@ -86,7 +74,7 @@ static noinline void microbenchmark(unsigned long iters)
*/
WRITE_ONCE(kcsan_enabled, false);
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+ pr_info("%s begin | iters: %lu\n", __func__, iters);
cycles = get_cycles();
while (iters--) {
@@ -97,73 +85,13 @@ static noinline void microbenchmark(unsigned long iters)
}
cycles = get_cycles() - cycles;
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+ pr_info("%s end | cycles: %llu\n", __func__, cycles);
WRITE_ONCE(kcsan_enabled, was_enabled);
/* restore context */
current->kcsan_ctx = ctx_save;
}
-/*
- * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's
- * debugfs file from multiple tasks to generate real conflicts and show reports.
- */
-static long test_dummy;
-static long test_flags;
-static long test_scoped;
-static noinline void test_thread(unsigned long iters)
-{
- const long CHANGE_BITS = 0xff00ff00ff00ff00L;
- const struct kcsan_ctx ctx_save = current->kcsan_ctx;
- cycles_t cycles;
-
- /* We may have been called from an atomic region; reset context. */
- memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
-
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
- pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n",
- &test_dummy, &test_flags, &test_scoped);
-
- cycles = get_cycles();
- while (iters--) {
- /* These all should generate reports. */
- __kcsan_check_read(&test_dummy, sizeof(test_dummy));
- ASSERT_EXCLUSIVE_WRITER(test_dummy);
- ASSERT_EXCLUSIVE_ACCESS(test_dummy);
-
- ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- /* not actually instrumented */
- WRITE_ONCE(test_dummy, iters); /* to observe value-change */
- __kcsan_check_write(&test_dummy, sizeof(test_dummy));
-
- test_flags ^= CHANGE_BITS; /* generate value-change */
- __kcsan_check_write(&test_flags, sizeof(test_flags));
-
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- {
- /* Should generate reports anywhere in this block. */
- ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped);
- ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped);
- BUG_ON(!current->kcsan_ctx.scoped_accesses.prev);
- /* Unrelated accesses. */
- __kcsan_check_access(&cycles, sizeof(cycles), 0);
- __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC);
- }
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- }
- cycles = get_cycles() - cycles;
-
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
-
- /* restore context */
- current->kcsan_ctx = ctx_save;
-}
-
static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
{
const unsigned long a = *(const unsigned long *)rhs;
@@ -220,7 +148,7 @@ static ssize_t insert_report_filterlist(const char *func)
ssize_t ret = 0;
if (!addr) {
- pr_err("KCSAN: could not find function: '%s'\n", func);
+ pr_err("could not find function: '%s'\n", func);
return -ENOENT;
}
@@ -270,9 +198,10 @@ static int show_info(struct seq_file *file, void *v)
/* show stats */
seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
- for (i = 0; i < KCSAN_COUNTER_COUNT; ++i)
- seq_printf(file, "%s: %ld\n", counter_to_name(i),
- atomic_long_read(&counters[i]));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) {
+ seq_printf(file, "%s: %ld\n", counter_names[i],
+ atomic_long_read(&kcsan_counters[i]));
+ }
/* show filter functions, and filter type */
spin_lock_irqsave(&report_filterlist_lock, flags);
@@ -307,18 +236,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o
WRITE_ONCE(kcsan_enabled, true);
} else if (!strcmp(arg, "off")) {
WRITE_ONCE(kcsan_enabled, false);
- } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) {
+ } else if (str_has_prefix(arg, "microbench=")) {
unsigned long iters;
- if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters))
+ if (kstrtoul(&arg[strlen("microbench=")], 0, &iters))
return -EINVAL;
microbenchmark(iters);
- } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) {
- unsigned long iters;
-
- if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters))
- return -EINVAL;
- test_thread(iters);
} else if (!strcmp(arg, "whitelist")) {
set_report_filterlist_whitelist(true);
} else if (!strcmp(arg, "blacklist")) {
@@ -343,7 +266,10 @@ static const struct file_operations debugfs_ops =
.release = single_release
};
-void __init kcsan_debugfs_init(void)
+static int __init kcsan_debugfs_init(void)
{
debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+ return 0;
}
+
+late_initcall(kcsan_debugfs_init);
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index f03562aaf2eb..170a2bb22f53 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -1,4 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN watchpoint encoding.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
#ifndef _KERNEL_KCSAN_ENCODING_H
#define _KERNEL_KCSAN_ENCODING_H
@@ -32,23 +37,25 @@
* 1. different addresses but with the same encoded address race;
* 2. and both map onto the same watchpoint slots;
*
- * Both these are assumed to be very unlikely. However, in case it still happens
+ * Both these are assumed to be very unlikely. However, in case it still
* happens, the report logic will filter out the false positive (see report.c).
*/
#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
-/*
- * Masks to set/retrieve the encoded data.
- */
-#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
-#define WATCHPOINT_SIZE_MASK \
- GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS)
-#define WATCHPOINT_ADDR_MASK \
- GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0)
+/* Bitmasks for the encoded watchpoint access information. */
+#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
+#define WATCHPOINT_SIZE_MASK GENMASK(BITS_PER_LONG-2, WATCHPOINT_ADDR_BITS)
+#define WATCHPOINT_ADDR_MASK GENMASK(WATCHPOINT_ADDR_BITS-1, 0)
+static_assert(WATCHPOINT_ADDR_MASK == (1UL << WATCHPOINT_ADDR_BITS) - 1);
+static_assert((WATCHPOINT_WRITE_MASK ^ WATCHPOINT_SIZE_MASK ^ WATCHPOINT_ADDR_MASK) == ~0UL);
static inline bool check_encodable(unsigned long addr, size_t size)
{
- return size <= MAX_ENCODABLE_SIZE;
+ /*
+ * While we can encode addrs<PAGE_SIZE, avoid crashing with a NULL
+ * pointer deref inside KCSAN.
+ */
+ return addr >= PAGE_SIZE && size <= MAX_ENCODABLE_SIZE;
}
static inline long
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 763d6d08d94b..ae33c2a7f07e 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -1,14 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0 */
-
/*
* The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please
* see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
*/
#ifndef _KERNEL_KCSAN_KCSAN_H
#define _KERNEL_KCSAN_KCSAN_H
+#include <linux/atomic.h>
#include <linux/kcsan.h>
+#include <linux/sched.h>
/* The number of adjacent watchpoints to check. */
#define KCSAN_CHECK_ADJACENT 1
@@ -23,10 +26,15 @@ extern unsigned int kcsan_udelay_interrupt;
extern bool kcsan_enabled;
/*
- * Initialize debugfs file.
+ * Save/restore IRQ flags state trace dirtied by KCSAN.
*/
-void kcsan_debugfs_init(void);
+void kcsan_save_irqtrace(struct task_struct *task);
+void kcsan_restore_irqtrace(struct task_struct *task);
+/*
+ * Statistics counters displayed via debugfs; should only be modified in
+ * slow-paths.
+ */
enum kcsan_counter_id {
/*
* Number of watchpoints currently in use.
@@ -79,12 +87,7 @@ enum kcsan_counter_id {
KCSAN_COUNTER_COUNT, /* number of counters */
};
-
-/*
- * Increment/decrement counter with given id; avoid calling these in fast-path.
- */
-extern void kcsan_counter_inc(enum kcsan_counter_id id);
-extern void kcsan_counter_dec(enum kcsan_counter_id id);
+extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
/*
* Returns true if data races in the function symbol that maps to func_addr
@@ -113,30 +116,27 @@ enum kcsan_value_change {
KCSAN_VALUE_CHANGE_TRUE,
};
-enum kcsan_report_type {
- /*
- * The thread that set up the watchpoint and briefly stalled was
- * signalled that another thread triggered the watchpoint.
- */
- KCSAN_REPORT_RACE_SIGNAL,
-
- /*
- * A thread found and consumed a matching watchpoint.
- */
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
+/*
+ * The calling thread hit and consumed a watchpoint: set the access information
+ * to be consumed by the reporting thread. No report is printed yet.
+ */
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, int watchpoint_idx);
- /*
- * No other thread was observed to race with the access, but the data
- * value before and after the stall differs.
- */
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
-};
+/*
+ * The calling thread observed that the watchpoint it set up was hit and
+ * consumed: print the full report based on information set by the racing
+ * thread.
+ */
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask);
/*
- * Print a race report from thread that encountered the race.
+ * No other thread was observed to race with the access, but the data value
+ * before and after the stall differs. Reports a race of "unknown origin".
*/
-extern void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx);
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
new file mode 100644
index 000000000000..015586217875
--- /dev/null
+++ b/kernel/kcsan/kcsan_test.c
@@ -0,0 +1,1607 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN test with various race scenarious to test runtime behaviour. Since the
+ * interface with which KCSAN's reports are obtained is via the console, this is
+ * the output we should verify. For each test case checks the presence (or
+ * absence) of generated reports. Relies on 'console' tracepoint to capture
+ * reports as they appear in the kernel log.
+ *
+ * Makes use of KUnit for test organization, and the Torture framework for test
+ * thread control.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Marco Elver <elver@google.com>
+ */
+
+#define pr_fmt(fmt) "kcsan_test: " fmt
+
+#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/jiffies.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/torture.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <trace/events/printk.h>
+
+#define KCSAN_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
+#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
+#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
+#else
+#define __KCSAN_ACCESS_RW(alt) (alt)
+#endif
+
+/* Points to current test-case memory access "kernels". */
+static void (*access_kernels[2])(void);
+
+static struct task_struct **threads; /* Lists of threads. */
+static unsigned long end_time; /* End time of test. */
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ int nlines;
+ char lines[3][512];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Setup test checking loop. */
+static __no_kcsan inline void
+begin_test_checks(void (*func1)(void), void (*func2)(void))
+{
+ kcsan_disable_current();
+
+ /*
+ * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
+ * least one race is reported.
+ */
+ end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500);
+
+ /* Signal start; release potential initialization of shared data. */
+ smp_store_release(&access_kernels[0], func1);
+ smp_store_release(&access_kernels[1], func2);
+}
+
+/* End test checking loop. */
+static __no_kcsan inline bool
+end_test_checks(bool stop)
+{
+ if (!stop && time_before(jiffies, end_time)) {
+ /* Continue checking */
+ might_sleep();
+ return false;
+ }
+
+ kcsan_enable_current();
+ return true;
+}
+
+/*
+ * Probe for console output: checks if a race was reported, and obtains observed
+ * lines of interest.
+ */
+__no_kcsan
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ /*
+ * Note that KCSAN reports under a global lock, so we do not risk the
+ * possibility of having multiple reports interleaved. If that were the
+ * case, we'd expect tests to fail.
+ */
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KCSAN report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) {
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+
+ if (strnstr(buf, "race at unknown origin", len)) {
+ if (WARN_ON(nlines != 2))
+ goto out;
+
+ /* No second line of interest. */
+ strcpy(observed.lines[nlines++], "<none>");
+ }
+ }
+
+out:
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+__no_kcsan
+static bool report_available(void)
+{
+ return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Report information we expect in a report. */
+struct expect_report {
+ /* Access information of both accesses. */
+ struct {
+ void *fn; /* Function pointer to expected function of top frame. */
+ void *addr; /* Address of access; unchecked if NULL. */
+ size_t size; /* Size of access; unchecked if @addr is NULL. */
+ int type; /* Access type, see KCSAN_ACCESS definitions. */
+ } access[2];
+};
+
+/* Check observed report matches information in @r. */
+__no_kcsan
+static bool __report_matches(const struct expect_report *r)
+{
+ const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
+ bool ret = false;
+ unsigned long flags;
+ typeof(*observed.lines) *expect;
+ const char *end;
+ char *cur;
+ int i;
+
+ /* Doubled-checked locking. */
+ if (!report_available())
+ return false;
+
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
+ is_assert ? "assert: race" : "data-race");
+ if (r->access[1].fn) {
+ char tmp[2][64];
+ int cmp;
+
+ /* Expect lexographically sorted function names in title. */
+ scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn);
+ scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn);
+ cmp = strcmp(tmp[0], tmp[1]);
+ cur += scnprintf(cur, end - cur, "%ps / %ps",
+ cmp < 0 ? r->access[0].fn : r->access[1].fn,
+ cmp < 0 ? r->access[1].fn : r->access[0].fn);
+ } else {
+ scnprintf(cur, end - cur, "%pS", r->access[0].fn);
+ /* The exact offset won't match, remove it. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+ }
+
+ /* Access 1 */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+ if (!r->access[1].fn)
+ cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
+
+ /* Access 1 & 2 */
+ for (i = 0; i < 2; ++i) {
+ const int ty = r->access[i].type;
+ const char *const access_type =
+ (ty & KCSAN_ACCESS_ASSERT) ?
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ "assert no accesses" :
+ "assert no writes") :
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ ((ty & KCSAN_ACCESS_COMPOUND) ?
+ "read-write" :
+ "write") :
+ "read");
+ const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
+ const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
+ const char *const access_type_aux =
+ (is_atomic && is_scoped) ? " (marked, reordered)"
+ : (is_atomic ? " (marked)"
+ : (is_scoped ? " (reordered)" : ""));
+
+ if (i == 1) {
+ /* Access 2 */
+ cur = expect[2];
+ end = &expect[2][sizeof(expect[2]) - 1];
+
+ if (!r->access[1].fn) {
+ /* Dummy string if no second access is available. */
+ strcpy(cur, "<none>");
+ break;
+ }
+ }
+
+ cur += scnprintf(cur, end - cur, "%s%s to ", access_type,
+ access_type_aux);
+
+ if (r->access[i].addr) /* Address is optional. */
+ cur += scnprintf(cur, end - cur, "0x%px of %zu bytes",
+ r->access[i].addr, r->access[i].size);
+ }
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) &&
+ /* Access info may appear in any order. */
+ ((strstr(observed.lines[1], expect[1]) &&
+ strstr(observed.lines[2], expect[2])) ||
+ (strstr(observed.lines[1], expect[2]) &&
+ strstr(observed.lines[2], expect[1])));
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
+ return ret;
+}
+
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+/* Due to reordering accesses, any access may appear as "(reordered)". */
+#define report_matches report_matches_any_reordered
+#else
+#define report_matches __report_matches
+#endif
+
+/* ===== Test kernels ===== */
+
+static long test_sink;
+static long test_var;
+/* @test_array should be large enough to fall into multiple watchpoint slots. */
+static long test_array[3 * PAGE_SIZE / sizeof(long)];
+static struct {
+ long val[8];
+} test_struct;
+static DEFINE_SEQLOCK(test_seqlock);
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_MUTEX(test_mutex);
+
+/*
+ * Helper to avoid compiler optimizing out reads, and to generate source values
+ * for writes.
+ */
+__no_kcsan
+static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+
+/*
+ * Generates a delay and some accesses that enter the runtime but do not produce
+ * data races.
+ */
+static noinline void test_delay(int iter)
+{
+ while (iter--)
+ sink_value(READ_ONCE(test_sink));
+}
+
+static noinline void test_kernel_read(void) { sink_value(test_var); }
+
+static noinline void test_kernel_write(void)
+{
+ test_var = READ_ONCE_NOCHECK(test_sink) + 1;
+}
+
+static noinline void test_kernel_write_nochange(void) { test_var = 42; }
+
+/* Suffixed by value-change exception filter. */
+static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; }
+
+static noinline void test_kernel_read_atomic(void)
+{
+ sink_value(READ_ONCE(test_var));
+}
+
+static noinline void test_kernel_write_atomic(void)
+{
+ WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
+}
+
+static noinline void test_kernel_atomic_rmw(void)
+{
+ /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */
+ __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED);
+}
+
+__no_kcsan
+static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
+
+static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+
+static noinline void test_kernel_assert_writer(void)
+{
+ ASSERT_EXCLUSIVE_WRITER(test_var);
+}
+
+static noinline void test_kernel_assert_access(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS(test_var);
+}
+
+#define TEST_CHANGE_BITS 0xff00ff00
+
+static noinline void test_kernel_change_bits(void)
+{
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ /*
+ * Avoid race of unknown origin for this test, just pretend they
+ * are atomic.
+ */
+ kcsan_nestable_atomic_begin();
+ test_var ^= TEST_CHANGE_BITS;
+ kcsan_nestable_atomic_end();
+ } else
+ WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_change(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_nochange(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
+}
+
+/*
+ * Scoped assertions do trigger anywhere in scope. However, the report should
+ * still only point at the start of the scope.
+ */
+static noinline void test_enter_scope(void)
+{
+ int x = 0;
+
+ /* Unrelated accesses to scoped assert. */
+ READ_ONCE(test_sink);
+ kcsan_check_read(&x, sizeof(x));
+}
+
+static noinline void test_kernel_assert_writer_scoped(void)
+{
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_assert_access_scoped(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_rmw_array(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_array); ++i)
+ test_array[i]++;
+}
+
+static noinline void test_kernel_write_struct(void)
+{
+ kcsan_check_write(&test_struct, sizeof(test_struct));
+ kcsan_disable_current();
+ test_struct.val[3]++; /* induce value change */
+ kcsan_enable_current();
+}
+
+static noinline void test_kernel_write_struct_part(void)
+{
+ test_struct.val[3] = 42;
+}
+
+static noinline void test_kernel_read_struct_zero_size(void)
+{
+ kcsan_check_read(&test_struct.val[3], 0);
+}
+
+static noinline void test_kernel_jiffies_reader(void)
+{
+ sink_value((long)jiffies);
+}
+
+static noinline void test_kernel_seqlock_reader(void)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&test_seqlock);
+ sink_value(test_var);
+ } while (read_seqretry(&test_seqlock, seq));
+}
+
+static noinline void test_kernel_seqlock_writer(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&test_seqlock, flags);
+ test_var++;
+ write_sequnlock_irqrestore(&test_seqlock, flags);
+}
+
+static noinline void test_kernel_atomic_builtins(void)
+{
+ /*
+ * Generate concurrent accesses, expecting no reports, ensuring KCSAN
+ * treats builtin atomics as actually atomic.
+ */
+ __atomic_load_n(&test_var, __ATOMIC_RELAXED);
+}
+
+static noinline void test_kernel_xor_1bit(void)
+{
+ /* Do not report data races between the read-writes. */
+ kcsan_nestable_atomic_begin();
+ test_var ^= 0x10000;
+ kcsan_nestable_atomic_end();
+}
+
+#define TEST_KERNEL_LOCKED(name, acquire, release) \
+ static noinline void test_kernel_##name(void) \
+ { \
+ long *flag = &test_struct.val[0]; \
+ long v = 0; \
+ if (!(acquire)) \
+ return; \
+ while (v++ < 100) { \
+ test_var++; \
+ barrier(); \
+ } \
+ release; \
+ test_delay(10); \
+ }
+
+TEST_KERNEL_LOCKED(with_memorder,
+ cmpxchg_acquire(flag, 0, 1) == 0,
+ smp_store_release(flag, 0));
+TEST_KERNEL_LOCKED(wrong_memorder,
+ cmpxchg_relaxed(flag, 0, 1) == 0,
+ WRITE_ONCE(*flag, 0));
+TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
+TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELAXED));
+
+/* ===== Test cases ===== */
+
+/*
+ * Tests that various barriers have the expected effect on internal state. Not
+ * exhaustive on atomic_t operations. Unlike the selftest, also checks for
+ * too-strict barrier instrumentation; these can be tolerated, because it does
+ * not cause false positives, but at least we should be aware of such cases.
+ */
+static void test_barrier_nothreads(struct kunit *test)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+
+ KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
+ KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
+
+#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = sizeof(test_var); \
+ barrier; \
+ KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
+ order_before ? 0 : sizeof(test_var), \
+ "improperly instrumented type=(" #access_type "): " name); \
+ } while (0)
+#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
+#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
+#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
+
+ /*
+ * Lockdep initialization can strengthen certain locking operations due
+ * to calling into instrumented files; "warm up" our locks.
+ */
+ spin_lock(&test_spinlock);
+ spin_unlock(&test_spinlock);
+ mutex_lock(&test_mutex);
+ mutex_unlock(&test_mutex);
+
+ /* Force creating a valid entry in reorder_access first. */
+ test_var = 0;
+ while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
+ __kcsan_check_read(&test_var, sizeof(test_var));
+ KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_EXPECT_READ_BARRIER(mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_RW_BARRIER(mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
+ KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ kcsan_nestable_atomic_end();
+}
+
+/* Simple test with normal data race. */
+__no_kcsan
+static void test_basic(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_read);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Stress KCSAN with lots of concurrent races on different addresses until
+ * timeout.
+ */
+__no_kcsan
+static void test_concurrent_races(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ /* NULL will match any address. */
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(false));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */
+__no_kcsan
+static void test_novalue_change(struct kunit *test)
+{
+ struct expect_report expect_rw = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ test_kernel_write_nochange(); /* Reset value. */
+ begin_test_checks(test_kernel_write_nochange, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should
+ * never apply work.
+ */
+__no_kcsan
+static void test_novalue_change_exception(struct kunit *test)
+{
+ struct expect_report expect_rw = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ test_kernel_write_nochange_rcu(); /* Reset value. */
+ begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that data races of unknown origin are reported. */
+__no_kcsan
+static void test_unknown_origin(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { NULL },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */
+__no_kcsan
+static void test_write_write_assume_atomic(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_write);
+ do {
+ sink_value(READ_ONCE(test_var)); /* induce value-change */
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races with writes larger than word-size are always reported,
+ * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races where only one write is larger than word-size are always
+ * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct_part(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that races with atomic accesses never result in reports. */
+__no_kcsan
+static void test_read_atomic_write_atomic(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that a race with an atomic and plain access result in reports. */
+__no_kcsan
+static void test_read_plain_atomic_write(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
+
+ begin_test_checks(test_kernel_read, test_kernel_write_atomic);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that atomic RMWs generate correct report. */
+__no_kcsan
+static void test_read_plain_atomic_rmw(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_atomic_rmw, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
+
+ begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Zero-sized accesses should never cause data race reports. */
+__no_kcsan
+static void test_zero_size_access(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the data_race() macro. */
+__no_kcsan
+static void test_data_race(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_race, test_kernel_data_race);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_writer(struct kunit *test)
+{
+ struct expect_report expect_access_writer = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ struct expect_report expect_access_access = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ bool match_expect_access_writer = false;
+ bool match_expect_access_access = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer);
+ do {
+ match_expect_access_writer |= report_matches(&expect_access_writer);
+ match_expect_access_access |= report_matches(&expect_access_access);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect_access_writer);
+ KUNIT_EXPECT_TRUE(test, match_expect_access_access);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_change(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_change_bits, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_nochange(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer_scoped(struct kunit *test)
+{
+ struct expect_report expect_start = {
+ .access = {
+ { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report expect_inscope = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_inscope = false;
+
+ begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
+ do {
+ match_expect_start |= report_matches(&expect_start);
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_scoped(struct kunit *test)
+{
+ struct expect_report expect_start1 = {
+ .access = {
+ { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_start2 = {
+ .access = { expect_start1.access[0], expect_start1.access[0] },
+ };
+ struct expect_report expect_inscope = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_inscope = false;
+
+ begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read);
+ end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */
+ do {
+ match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
+}
+
+/*
+ * jiffies is special (declared to be volatile) and its accesses are typically
+ * not marked; this test ensures that the compiler nor KCSAN gets confused about
+ * jiffies's declaration on different architectures.
+ */
+__no_kcsan
+static void test_jiffies_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that racing accesses in seqlock critical sections are not reported. */
+__no_kcsan
+static void test_seqlock_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Test atomic builtins work and required instrumentation functions exist. We
+ * also test that KCSAN understands they're atomic by racing with them via
+ * test_kernel_atomic_builtins(), and expect no reports.
+ *
+ * The atomic builtins _SHOULD NOT_ be used in normal kernel code!
+ */
+static void test_atomic_builtins(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins);
+ do {
+ long tmp;
+
+ kcsan_enable_current();
+
+ __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED);
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED));
+
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 20L, test_var);
+
+ tmp = 20L;
+ KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L,
+ 0, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 20L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+ KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L,
+ 1, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 30L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, -2L, test_var);
+
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+ kcsan_disable_current();
+
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_1bit_value_change(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ },
+ };
+ bool match = false;
+
+ begin_test_checks(test_kernel_read, test_kernel_xor_1bit);
+ do {
+ match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)
+ ? report_available()
+ : report_matches(&expect);
+ } while (!end_test_checks(match));
+ if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ KUNIT_EXPECT_FALSE(test, match);
+ else
+ KUNIT_EXPECT_TRUE(test, match);
+}
+
+__no_kcsan
+static void test_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_with_memorder,
+ test_kernel_atomic_builtin_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
+ test_kernel_atomic_builtin_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/*
+ * Generate thread counts for all test cases. Values generated are in interval
+ * [2, 5] followed by exponentially increasing thread counts from 8 to 32.
+ *
+ * The thread counts are chosen to cover potentially interesting boundaries and
+ * corner cases (2 to 5), and then stress the system with larger counts.
+ */
+static const void *nthreads_gen_params(const void *prev, char *desc)
+{
+ long nthreads = (long)prev;
+
+ if (nthreads < 0 || nthreads >= 32)
+ nthreads = 0; /* stop */
+ else if (!nthreads)
+ nthreads = 2; /* initial value */
+ else if (nthreads < 5)
+ nthreads++;
+ else if (nthreads == 5)
+ nthreads = 8;
+ else
+ nthreads *= 2;
+
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ /*
+ * Without any preemption, keep 2 CPUs free for other tasks, one
+ * of which is the main test case function checking for
+ * completion or failure.
+ */
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
+ const long min_required_cpus = 2 + min_unused_cpus;
+
+ if (num_online_cpus() < min_required_cpus) {
+ pr_err_once("Too few online CPUs (%u < %ld) for test\n",
+ num_online_cpus(), min_required_cpus);
+ nthreads = 0;
+ } else if (nthreads >= num_online_cpus() - min_unused_cpus) {
+ /* Use negative value to indicate last param. */
+ nthreads = -(num_online_cpus() - min_unused_cpus);
+ pr_warn_once("Limiting number of threads to %ld (only %d online CPUs)\n",
+ -nthreads, num_online_cpus());
+ }
+ }
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "threads=%ld", abs(nthreads));
+ return (void *)nthreads;
+}
+
+#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
+static struct kunit_case kcsan_test_cases[] = {
+ KUNIT_CASE(test_barrier_nothreads),
+ KCSAN_KUNIT_CASE(test_basic),
+ KCSAN_KUNIT_CASE(test_concurrent_races),
+ KCSAN_KUNIT_CASE(test_novalue_change),
+ KCSAN_KUNIT_CASE(test_novalue_change_exception),
+ KCSAN_KUNIT_CASE(test_unknown_origin),
+ KCSAN_KUNIT_CASE(test_write_write_assume_atomic),
+ KCSAN_KUNIT_CASE(test_write_write_struct),
+ KCSAN_KUNIT_CASE(test_write_write_struct_part),
+ KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
+ KCSAN_KUNIT_CASE(test_zero_size_access),
+ KCSAN_KUNIT_CASE(test_data_race),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+ KCSAN_KUNIT_CASE(test_jiffies_noreport),
+ KCSAN_KUNIT_CASE(test_seqlock_noreport),
+ KCSAN_KUNIT_CASE(test_atomic_builtins),
+ KCSAN_KUNIT_CASE(test_1bit_value_change),
+ KCSAN_KUNIT_CASE(test_correct_barrier),
+ KCSAN_KUNIT_CASE(test_missing_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+/* Concurrent accesses from interrupts. */
+__no_kcsan
+static void access_thread_timer(struct timer_list *timer)
+{
+ static atomic_t cnt = ATOMIC_INIT(0);
+ unsigned int idx;
+ void (*func)(void);
+
+ idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+}
+
+/* The main loop for each thread. */
+__no_kcsan
+static int access_thread(void *arg)
+{
+ struct timer_list timer;
+ unsigned int cnt = 0;
+ unsigned int idx;
+ void (*func)(void);
+
+ timer_setup_on_stack(&timer, access_thread_timer, 0);
+ do {
+ might_sleep();
+
+ if (!timer_pending(&timer))
+ mod_timer(&timer, jiffies + 1);
+ else {
+ /* Iterate through all kernels. */
+ idx = cnt++ % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+ }
+ } while (!torture_must_stop());
+ del_timer_sync(&timer);
+ destroy_timer_on_stack(&timer);
+
+ torture_kthread_stopping("access_thread");
+ return 0;
+}
+
+__no_kcsan
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+ int nthreads;
+ int i;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ for (i = 0; i < ARRAY_SIZE(observed.lines); ++i)
+ observed.lines[i][0] = '\0';
+ observed.nlines = 0;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ if (strstr(test->name, "nothreads"))
+ return 0;
+
+ if (!torture_init_begin((char *)test->name, 1))
+ return -EBUSY;
+
+ if (WARN_ON(threads))
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) {
+ if (WARN_ON(access_kernels[i]))
+ goto err;
+ }
+
+ nthreads = abs((long)test->param_value);
+ if (WARN_ON(!nthreads))
+ goto err;
+
+ threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), GFP_KERNEL);
+ if (WARN_ON(!threads))
+ goto err;
+
+ threads[nthreads] = NULL;
+ for (i = 0; i < nthreads; ++i) {
+ if (torture_create_kthread(access_thread, NULL, threads[i]))
+ goto err;
+ }
+
+ torture_init_end();
+
+ return 0;
+
+err:
+ kfree(threads);
+ threads = NULL;
+ torture_init_end();
+ return -EINVAL;
+}
+
+__no_kcsan
+static void test_exit(struct kunit *test)
+{
+ struct task_struct **stop_thread;
+ int i;
+
+ if (strstr(test->name, "nothreads"))
+ return;
+
+ if (torture_cleanup_begin())
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i)
+ WRITE_ONCE(access_kernels[i], NULL);
+
+ if (threads) {
+ for (stop_thread = threads; *stop_thread; stop_thread++)
+ torture_stop_kthread(reader_thread, *stop_thread);
+
+ kfree(threads);
+ threads = NULL;
+ }
+
+ torture_cleanup_end();
+}
+
+__no_kcsan
+static void register_tracepoints(void)
+{
+ register_trace_console(probe_console, NULL);
+}
+
+__no_kcsan
+static void unregister_tracepoints(void)
+{
+ unregister_trace_console(probe_console, NULL);
+}
+
+static int kcsan_suite_init(struct kunit_suite *suite)
+{
+ register_tracepoints();
+ return 0;
+}
+
+static void kcsan_suite_exit(struct kunit_suite *suite)
+{
+ unregister_tracepoints();
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kcsan_suite_init,
+ .suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h
new file mode 100644
index 000000000000..2c01fe4a59ee
--- /dev/null
+++ b/kernel/kcsan/permissive.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Special rules for ignoring entire classes of data-racy memory accesses. None
+ * of the rules here imply that such data races are generally safe!
+ *
+ * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep
+ * them separate from core code to make it easier to audit.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_PERMISSIVE_H
+#define _KERNEL_KCSAN_PERMISSIVE_H
+
+#include <linux/bitops.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+/*
+ * Access ignore rules based on address.
+ */
+static __always_inline bool kcsan_ignore_address(const volatile void *ptr)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Data-racy bitops on current->flags are too common, ignore completely
+ * for now.
+ */
+ return ptr == &current->flags;
+}
+
+/*
+ * Data race ignore rules based on access type and value change patterns.
+ */
+static bool
+kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Rules here are only for plain read accesses, so that we still report
+ * data races between plain read-write accesses.
+ */
+ if (type || size > sizeof(long))
+ return false;
+
+ /*
+ * A common pattern is checking/setting just 1 bit in a variable; for
+ * example:
+ *
+ * if (flags & SOME_FLAG) { ... }
+ *
+ * and elsewhere flags is updated concurrently:
+ *
+ * flags |= SOME_OTHER_FLAG; // just 1 bit
+ *
+ * While it is still recommended that such accesses be marked
+ * appropriately, in many cases these types of data races are so common
+ * that marking them all is often unrealistic and left to maintainer
+ * preference.
+ *
+ * The assumption in all cases is that with all known compiler
+ * optimizations (including those that tear accesses), because no more
+ * than 1 bit changed, the plain accesses are safe despite the presence
+ * of data races.
+ *
+ * The rules here will ignore the data races if we observe no more than
+ * 1 bit changed.
+ *
+ * Of course many operations can effecively change just 1 bit, but the
+ * general assuption that data races involving 1-bit changes can be
+ * tolerated still applies.
+ *
+ * And in case a true bug is missed, the bug likely manifests as a
+ * reportable data race elsewhere.
+ */
+ if (hweight64(diff) == 1) {
+ /*
+ * Exception: Report data races where the values look like
+ * ordinary booleans (one of them was 0 and the 0th bit was
+ * changed) More often than not, they come with interesting
+ * memory ordering requirements, so let's report them.
+ */
+ if (!((!old || !new) && diff == 1))
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _KERNEL_KCSAN_PERMISSIVE_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index ac5f8345bae9..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -1,8 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN reporting.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
@@ -26,6 +32,7 @@ struct access_info {
int access_type;
int task_pid;
int cpu_id;
+ unsigned long ip;
};
/*
@@ -208,9 +215,9 @@ static const char *get_access_type(int type)
if (type & KCSAN_ACCESS_ASSERT) {
if (type & KCSAN_ACCESS_SCOPED) {
if (type & KCSAN_ACCESS_WRITE)
- return "assert no accesses (scoped)";
+ return "assert no accesses (reordered)";
else
- return "assert no writes (scoped)";
+ return "assert no writes (reordered)";
} else {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses";
@@ -228,14 +235,22 @@ static const char *get_access_type(int type)
return "write";
case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "write (marked)";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
- return "read (scoped)";
+ return "read (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
- return "read (marked, scoped)";
+ return "read (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
- return "write (scoped)";
+ return "write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
- return "write (marked, scoped)";
+ return "write (marked, reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked, reordered)";
default:
BUG();
}
@@ -275,8 +290,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
cur = strnstr(buf, "kcsan_", len);
if (cur) {
- cur += sizeof("kcsan_") - 1;
- if (strncmp(cur, "test", sizeof("test") - 1))
+ cur += strlen("kcsan_");
+ if (!str_has_prefix(cur, "test"))
continue; /* KCSAN runtime function. */
/* KCSAN related test. */
}
@@ -291,6 +306,52 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
return skip;
}
+/*
+ * Skips to the first entry that matches the function of @ip, and then replaces
+ * that entry with @ip, returning the entries to skip with @replaced containing
+ * the replaced entry.
+ */
+static int
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ unsigned long symbolsize, offset;
+ unsigned long target_func;
+ int skip;
+
+ if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
+ target_func = ip - offset;
+ else
+ goto fallback;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ unsigned long func = stack_entries[skip];
+
+ if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
+ goto fallback;
+ func -= offset;
+
+ if (func == target_func) {
+ *replaced = stack_entries[skip];
+ stack_entries[skip] = ip;
+ return skip;
+ }
+ }
+
+fallback:
+ /* Should not happen; the resulting stack trace is likely misleading. */
+ WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
+ return get_stack_skipnr(stack_entries, num_entries);
+}
+
+static int
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
+ get_stack_skipnr(stack_entries, num_entries);
+}
+
/* Compares symbolized strings of addr1 and addr2. */
static int sym_strcmp(void *addr1, void *addr2)
{
@@ -303,28 +364,38 @@ static int sym_strcmp(void *addr1, void *addr2)
return strncmp(buf1, buf2, sizeof(buf1));
}
+static void
+print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+{
+ stack_trace_print(stack_entries, num_entries, 0);
+ if (reordered_to)
+ pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
+}
+
static void print_verbose_info(struct task_struct *task)
{
if (!task)
return;
+ /* Restore IRQ state trace for printing. */
+ kcsan_restore_irqtrace(task);
+
pr_err("\n");
debug_show_held_locks(task);
print_irqtrace_events(task);
}
-/*
- * Returns true if a report was generated, false otherwise.
- */
-static bool print_report(enum kcsan_value_change value_change,
- enum kcsan_report_type type,
+static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info)
+ struct other_info *other_info,
+ u64 old, u64 new, u64 mask)
{
+ unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
- int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_reordered_to = 0;
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@@ -332,25 +403,25 @@ static bool print_report(enum kcsan_value_change value_change,
* Must check report filter rules before starting to print.
*/
if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
- return false;
+ return;
- if (type == KCSAN_REPORT_RACE_SIGNAL) {
- other_skipnr = get_stack_skipnr(other_info->stack_entries,
- other_info->num_stack_entries);
+ if (other_info) {
+ other_skipnr = sanitize_stack_entries(other_info->stack_entries,
+ other_info->num_stack_entries,
+ other_info->ai.ip, &other_reordered_to);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
if (skip_report(value_change, other_frame))
- return false;
+ return;
}
if (rate_limit_report(this_frame, other_frame))
- return false;
+ return;
/* Print report header. */
pr_err("==================================================================\n");
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL: {
+ if (other_info) {
int cmp;
/*
@@ -362,32 +433,24 @@ static bool print_report(enum kcsan_value_change value_change,
get_bug_type(ai->access_type | other_info->ai.access_type),
(void *)(cmp < 0 ? other_frame : this_frame),
(void *)(cmp < 0 ? this_frame : other_frame));
- } break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
(void *)this_frame);
- break;
-
- default:
- BUG();
}
pr_err("\n");
/* Print information about the racing accesses. */
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL:
+ if (other_info) {
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(other_info->ai.access_type), other_info->ai.ptr,
other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
other_info->ai.cpu_id);
/* Print the other thread's stack trace. */
- stack_trace_print(other_info->stack_entries + other_skipnr,
+ print_stack_trace(other_info->stack_entries + other_skipnr,
other_info->num_stack_entries - other_skipnr,
- 0);
-
+ other_reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(other_info->task);
@@ -395,42 +458,50 @@ static bool print_report(enum kcsan_value_change value_change,
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- default:
- BUG();
}
/* Print stack trace of this thread. */
- stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
- 0);
-
+ print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
+ /* Print observed value change. */
+ if (ai->size <= 8) {
+ int hex_len = ai->size * 2;
+ u64 diff = old ^ new;
+
+ if (mask)
+ diff &= mask;
+ if (diff) {
+ pr_err("\n");
+ pr_err("value changed: 0x%0*llx -> 0x%0*llx\n",
+ hex_len, old, hex_len, new);
+ if (mask) {
+ pr_err(" bits changed: 0x%0*llx with mask 0x%0*llx\n",
+ hex_len, diff, hex_len, mask);
+ }
+ }
+ }
+
/* Print report footer. */
pr_err("\n");
pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- return true;
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
{
- if (other_info)
- /*
- * Use size to denote valid/invalid, since KCSAN entirely
- * ignores 0-sized accesses.
- */
- other_info->ai.size = 0;
-
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely ignores
+ * 0-sized accesses.
+ */
+ other_info->ai.size = 0;
raw_spin_unlock_irqrestore(&report_lock, *flags);
}
@@ -448,7 +519,7 @@ static void set_other_info_task_blocking(unsigned long *flags,
* We may be instrumenting a code-path where current->state is already
* something other than TASK_RUNNING.
*/
- const bool is_running = current->state == TASK_RUNNING;
+ const bool is_running = task_is_running(current);
/*
* To avoid deadlock in case we are in an interrupt here and this is a
* race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
@@ -552,7 +623,7 @@ static bool prepare_report_consumer(unsigned long *flags,
* If the actual accesses to not match, this was a false
* positive due to watchpoint encoding.
*/
- kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]);
goto discard;
}
@@ -563,72 +634,82 @@ discard:
return false;
}
-/*
- * Depending on the report type either sets @other_info and returns false, or
- * awaits @other_info and returns true. If @other_info is not required for the
- * report type, simply acquires @report_lock and returns true.
- */
-static noinline bool prepare_report(unsigned long *flags,
- enum kcsan_report_type type,
- const struct access_info *ai,
- struct other_info *other_info)
-{
- switch (type) {
- case KCSAN_REPORT_CONSUMED_WATCHPOINT:
- prepare_report_producer(flags, ai, other_info);
- return false;
- case KCSAN_REPORT_RACE_SIGNAL:
- return prepare_report_consumer(flags, ai, other_info);
- default:
- /* @other_info not required; just acquire @report_lock. */
- raw_spin_lock_irqsave(&report_lock, *flags);
- return true;
- }
-}
-
-void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx)
+static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
+ int access_type, unsigned long ip)
{
- unsigned long flags = 0;
- const struct access_info ai = {
+ return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
- .cpu_id = raw_smp_processor_id()
+ .cpu_id = raw_smp_processor_id(),
+ /* Only replace stack entry with @ip if scoped access. */
+ .ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
};
- struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
- ? NULL : &other_infos[watchpoint_idx];
+}
+
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, int watchpoint_idx)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ unsigned long flags;
kcsan_disable_current();
- if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos)))
- goto out;
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ prepare_report_producer(&flags, &ai, &other_infos[watchpoint_idx]);
+ lockdep_on();
+ kcsan_enable_current();
+}
+
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ struct other_info *other_info = &other_infos[watchpoint_idx];
+ unsigned long flags = 0;
+
+ kcsan_disable_current();
/*
- * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
- * we do not turn off lockdep here; this could happen due to recursion
- * into lockdep via KCSAN if we detect a race in utilities used by
- * lockdep.
+ * Because we may generate reports when we're in scheduler code, the use
+ * of printk() could deadlock. Until such time that all printing code
+ * called in print_report() is scheduler-safe, accept the risk, and just
+ * get our message out. As such, also disable lockdep to hide the
+ * warning, and avoid disabling lockdep for the rest of the kernel.
*/
lockdep_off();
- if (prepare_report(&flags, type, &ai, other_info)) {
- /*
- * Never report if value_change is FALSE, only if we it is
- * either TRUE or MAYBE. In case of MAYBE, further filtering may
- * be done once we know the full stack trace in print_report().
- */
- bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
- print_report(value_change, type, &ai, other_info);
+ if (!prepare_report_consumer(&flags, &ai, other_info))
+ goto out;
+ /*
+ * Never report if value_change is FALSE, only when it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ if (value_change != KCSAN_VALUE_CHANGE_FALSE)
+ print_report(value_change, &ai, other_info, old, new, mask);
- if (reported && panic_on_warn)
- panic("panic_on_warn set ...\n");
+ release_report(&flags, other_info);
+out:
+ lockdep_on();
+ kcsan_enable_current();
+}
- release_report(&flags, other_info);
- }
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ unsigned long flags;
+
+ kcsan_disable_current();
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ raw_spin_lock_irqsave(&report_lock, flags);
+ print_report(KCSAN_VALUE_CHANGE_TRUE, &ai, NULL, old, new, mask);
+ raw_spin_unlock_irqrestore(&report_lock, flags);
lockdep_on();
-out:
kcsan_enable_current();
}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
new file mode 100644
index 000000000000..84a1200271af
--- /dev/null
+++ b/kernel/kcsan/selftest.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN short boot-time selftests.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "encoding.h"
+
+#define ITERS_PER_TEST 2000
+
+/*
+ * Test watchpoint encode and decode: check that encoding some access's info,
+ * and then subsequent decode preserves the access's info.
+ */
+static bool __init test_encode_decode(void)
+{
+ int i;
+
+ for (i = 0; i < ITERS_PER_TEST; ++i) {
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
+ unsigned long verif_masked_addr;
+ long encoded_watchpoint;
+ bool verif_is_write;
+ unsigned long addr;
+ size_t verif_size;
+
+ get_random_bytes(&addr, sizeof(addr));
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+
+ if (WARN_ON(!check_encodable(addr, size)))
+ return false;
+
+ encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
+ return false;
+ }
+
+ return true;
+}
+
+/* Test access matching function. */
+static bool __init test_matching_access(void)
+{
+ if (WARN_ON(!matching_access(10, 1, 10, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 2, 11, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 1, 9, 2)))
+ return false;
+ if (WARN_ON(matching_access(10, 1, 11, 1)))
+ return false;
+ if (WARN_ON(matching_access(9, 1, 10, 1)))
+ return false;
+
+ /*
+ * An access of size 0 could match another access, as demonstrated here.
+ * Rather than add more comparisons to 'matching_access()', which would
+ * end up in the fast-path for *all* checks, check_access() simply
+ * returns for all accesses of size 0.
+ */
+ if (WARN_ON(!matching_access(8, 8, 12, 0)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Correct memory barrier instrumentation is critical to avoiding false
+ * positives: simple test to check at boot certain barriers are always properly
+ * instrumented. See kcsan_test for a more complete test.
+ */
+static DEFINE_SPINLOCK(test_spinlock);
+static bool __init test_barrier(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ bool ret = true;
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+ long test_var;
+
+ if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
+ return true;
+
+#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = 1; \
+ barrier; \
+ if (reorder_access->size != 0) { \
+ pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
+ ret = false; \
+ } \
+ } while (0)
+#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
+#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
+#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_CHECK_READ_BARRIER(mb());
+ KCSAN_CHECK_READ_BARRIER(rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb());
+ KCSAN_CHECK_READ_BARRIER(smp_rmb());
+ KCSAN_CHECK_READ_BARRIER(dma_rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_WRITE_BARRIER(mb());
+ KCSAN_CHECK_WRITE_BARRIER(wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_RW_BARRIER(mb());
+ KCSAN_CHECK_RW_BARRIER(wmb());
+ KCSAN_CHECK_RW_BARRIER(rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb());
+ KCSAN_CHECK_RW_BARRIER(smp_wmb());
+ KCSAN_CHECK_RW_BARRIER(smp_rmb());
+ KCSAN_CHECK_RW_BARRIER(dma_wmb());
+ KCSAN_CHECK_RW_BARRIER(dma_rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
+ KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ kcsan_nestable_atomic_end();
+
+ return ret;
+}
+
+static int __init kcsan_selftest(void)
+{
+ int passed = 0;
+ int total = 0;
+
+#define RUN_TEST(do_test) \
+ do { \
+ ++total; \
+ if (do_test()) \
+ ++passed; \
+ else \
+ pr_err("selftest: " #do_test " failed"); \
+ } while (0)
+
+ RUN_TEST(test_encode_decode);
+ RUN_TEST(test_matching_access);
+ RUN_TEST(test_barrier);
+
+ pr_info("selftest: %d/%d tests passed\n", passed, total);
+ if (passed != total)
+ panic("selftests failed");
+ return 0;
+}
+postcore_initcall(kcsan_selftest);
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c
deleted file mode 100644
index d26a052d3383..000000000000
--- a/kernel/kcsan/test.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/printk.h>
-#include <linux/random.h>
-#include <linux/types.h>
-
-#include "encoding.h"
-
-#define ITERS_PER_TEST 2000
-
-/* Test requirements. */
-static bool test_requires(void)
-{
- /* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
-}
-
-/*
- * Test watchpoint encode and decode: check that encoding some access's info,
- * and then subsequent decode preserves the access's info.
- */
-static bool test_encode_decode(void)
-{
- int i;
-
- for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
- unsigned long addr;
-
- prandom_bytes(&addr, sizeof(addr));
- if (WARN_ON(!check_encodable(addr, size)))
- return false;
-
- /* Encode and decode */
- {
- const long encoded_watchpoint =
- encode_watchpoint(addr, size, is_write);
- unsigned long verif_masked_addr;
- size_t verif_size;
- bool verif_is_write;
-
- /* Check special watchpoints */
- if (WARN_ON(decode_watchpoint(
- INVALID_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(decode_watchpoint(
- CONSUMED_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
-
- /* Check decoding watchpoint returns same data */
- if (WARN_ON(!decode_watchpoint(
- encoded_watchpoint, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(verif_masked_addr !=
- (addr & WATCHPOINT_ADDR_MASK)))
- goto fail;
- if (WARN_ON(verif_size != size))
- goto fail;
- if (WARN_ON(is_write != verif_is_write))
- goto fail;
-
- continue;
-fail:
- pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
- __func__, is_write ? "write" : "read", size,
- addr, encoded_watchpoint,
- verif_is_write ? "write" : "read", verif_size,
- verif_masked_addr);
- return false;
- }
- }
-
- return true;
-}
-
-/* Test access matching function. */
-static bool test_matching_access(void)
-{
- if (WARN_ON(!matching_access(10, 1, 10, 1)))
- return false;
- if (WARN_ON(!matching_access(10, 2, 11, 1)))
- return false;
- if (WARN_ON(!matching_access(10, 1, 9, 2)))
- return false;
- if (WARN_ON(matching_access(10, 1, 11, 1)))
- return false;
- if (WARN_ON(matching_access(9, 1, 10, 1)))
- return false;
-
- /*
- * An access of size 0 could match another access, as demonstrated here.
- * Rather than add more comparisons to 'matching_access()', which would
- * end up in the fast-path for *all* checks, check_access() simply
- * returns for all accesses of size 0.
- */
- if (WARN_ON(!matching_access(8, 8, 12, 0)))
- return false;
-
- return true;
-}
-
-static int __init kcsan_selftest(void)
-{
- int passed = 0;
- int total = 0;
-
-#define RUN_TEST(do_test) \
- do { \
- ++total; \
- if (do_test()) \
- ++passed; \
- else \
- pr_err("KCSAN selftest: " #do_test " failed"); \
- } while (0)
-
- RUN_TEST(test_requires);
- RUN_TEST(test_encode_decode);
- RUN_TEST(test_matching_access);
-
- pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
- if (passed != total)
- panic("KCSAN selftests failed");
- return 0;
-}
-postcore_initcall(kcsan_selftest);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f977786fe498..8f35a5a42af8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -19,26 +19,9 @@
#include "kexec_internal.h"
-static int copy_user_segment_list(struct kimage *image,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
-{
- int ret;
- size_t segment_bytes;
-
- /* Read in the segments */
- image->nr_segments = nr_segments;
- segment_bytes = nr_segments * sizeof(*segments);
- ret = copy_from_user(image->segment, segments, segment_bytes);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
- struct kexec_segment __user *segments,
+ struct kexec_segment *segments,
unsigned long flags)
{
int ret;
@@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
return -ENOMEM;
image->start = entry;
-
- ret = copy_user_segment_list(image, nr_segments, segments);
- if (ret)
- goto out_free_image;
+ image->nr_segments = nr_segments;
+ memcpy(image->segment, segments, nr_segments * sizeof(*segments));
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
@@ -104,12 +85,20 @@ out_free_image:
}
static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
- struct kexec_segment __user *segments, unsigned long flags)
+ struct kexec_segment *segments, unsigned long flags)
{
struct kimage **dest_image, *image;
unsigned long i;
int ret;
+ /*
+ * Because we write directly to the reserved memory region when loading
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
+ */
+ if (!kexec_trylock())
+ return -EBUSY;
+
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
@@ -121,7 +110,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (nr_segments == 0) {
/* Uninstall image */
kimage_free(xchg(dest_image, NULL));
- return 0;
+ ret = 0;
+ goto out_unlock;
}
if (flags & KEXEC_ON_CRASH) {
/*
@@ -134,11 +124,16 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
if (ret)
- return ret;
+ goto out_unlock;
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if (flags & KEXEC_UPDATE_ELFCOREHDR)
+ image->update_elfcorehdr = 1;
+#endif
+
ret = machine_kexec_prepare(image);
if (ret)
goto out;
@@ -171,6 +166,8 @@ out:
arch_kexec_protect_crashkres();
kimage_free(image);
+out_unlock:
+ kexec_unlock();
return ret;
}
@@ -198,14 +195,16 @@ out:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
- result = security_kernel_load_data(LOADING_KEXEC_IMAGE);
+ result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
if (result < 0)
return result;
@@ -236,7 +235,8 @@ static inline int kexec_load_check(unsigned long nr_segments,
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- int result;
+ struct kexec_segment *ksegments;
+ unsigned long result;
result = kexec_load_check(nr_segments, flags);
if (result)
@@ -247,20 +247,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
+ ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
+ if (IS_ERR(ksegments))
+ return PTR_ERR(ksegments);
- result = do_kexec_load(entry, nr_segments, segments, flags);
-
- mutex_unlock(&kexec_mutex);
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+ kfree(ksegments);
return result;
}
@@ -272,7 +264,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, flags)
{
struct compat_kexec_segment in;
- struct kexec_segment out, __user *ksegments;
+ struct kexec_segment *ksegments;
unsigned long i, result;
result = kexec_load_check(nr_segments, flags);
@@ -285,37 +277,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
+ GFP_KERNEL);
+ if (!ksegments)
+ return -ENOMEM;
+
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
- return -EFAULT;
+ goto fail;
- out.buf = compat_ptr(in.buf);
- out.bufsz = in.bufsz;
- out.mem = in.mem;
- out.memsz = in.memsz;
-
- result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result)
- return -EFAULT;
+ ksegments[i].buf = compat_ptr(in.buf);
+ ksegments[i].bufsz = in.bufsz;
+ ksegments[i].mem = in.mem;
+ ksegments[i].memsz = in.memsz;
}
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
result = do_kexec_load(entry, nr_segments, ksegments, flags);
- mutex_unlock(&kexec_mutex);
-
+fail:
+ kfree(ksegments);
return result;
}
#endif
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..d08fc7b5db97 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -26,6 +27,7 @@
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
+#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
@@ -36,39 +38,21 @@
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
-#include <linux/frame.h>
+#include <linux/objtool.h>
+#include <linux/kmsg_dump.h>
#include <asm/page.h>
#include <asm/sections.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
+bool kexec_file_dbg_print;
int kexec_should_crash(struct task_struct *p)
{
@@ -80,7 +64,7 @@ int kexec_should_crash(struct task_struct *p)
if (crash_kexec_post_notifiers)
return 0;
/*
- * There are 4 panic() calls in do_exit() path, each of which
+ * There are 4 panic() calls in make_task_dead() path, each of which
* corresponds to each of these 4 conditions.
*/
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
@@ -109,7 +93,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded);
* defined more restrictively in <asm/kexec.h>.
*
* The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
+ * new kernel is placed in the control_code_buffer, whose size
* is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
* page of memory is necessary, but some architectures require more.
* Because this memory must be identity mapped in the transition from
@@ -275,6 +259,12 @@ struct kimage *do_kimage_alloc_init(void)
/* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unusable_pages);
+#ifdef CONFIG_CRASH_HOTPLUG
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_index = -1;
+ image->elfcorehdr_updated = false;
+#endif
+
return image;
}
@@ -288,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image,
unsigned long mstart, mend;
mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((end >= mstart) && (start <= mend))
return 1;
}
@@ -382,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
+ eaddr = (epfn << PAGE_SHIFT) - 1;
if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
kimage_is_destination_range(image, addr, eaddr)) {
list_add(&pages->lru, &extra_pages);
@@ -442,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
pages = NULL;
size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(image->control_page, size);
hole_end = hole_start + size - 1;
while (hole_end <= crashk_res.end) {
unsigned long i;
@@ -459,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
mend = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
/* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(mend, size);
hole_end = hole_start + size - 1;
break;
}
@@ -467,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- image->control_page = hole_end;
+ image->control_page = hole_end + 1;
break;
}
}
@@ -560,23 +550,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
@@ -590,11 +574,6 @@ static void kimage_free_extra_pages(struct kimage *image)
}
-int __weak machine_kexec_post_load(struct kimage *image)
-{
- return 0;
-}
-
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -739,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page is not a destination page use it */
if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
+ addr + PAGE_SIZE - 1))
break;
/*
@@ -767,7 +746,6 @@ static struct page *kimage_alloc_page(struct kimage *image,
kimage_free_pages(old_page);
continue;
}
- addr = old_addr;
page = old_page;
break;
}
@@ -787,7 +765,6 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned char __user *buf = NULL;
unsigned char *kbuf = NULL;
- result = 0;
if (image->file_mode)
kbuf = segment->kbuf;
else
@@ -815,7 +792,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -828,7 +805,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
@@ -879,7 +856,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -895,7 +872,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
@@ -932,9 +909,123 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
+
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
+
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
+
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
-int kexec_load_disabled;
+static int kexec_load_disabled;
+
+#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
+
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
+
+ return ret;
+ }
+
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
+
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table kexec_core_sysctls[] = {
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ { }
+};
+
+static int __init kexec_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", kexec_core_sysctls);
+ return 0;
+}
+late_initcall(kexec_core_sysctl_init);
+#endif
+
+bool kexec_load_permitted(int kexec_image_type)
+{
+ struct kexec_load_limit *limit;
+
+ /*
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
+ */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
+ }
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+
+ return true;
+}
/*
* No panic_cpu check version of crash_kexec(). This function is called
@@ -943,7 +1034,7 @@ int kexec_load_disabled;
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -951,7 +1042,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -960,12 +1051,12 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
-void crash_kexec(struct pt_regs *regs)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
int old_cpu, this_cpu;
@@ -974,11 +1065,11 @@ void crash_kexec(struct pt_regs *regs)
* panic(). Otherwise parallel calls of panic() and crash_kexec()
* may stop each other. To exclude them, we use panic_cpu here too.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu == PANIC_CPU_INVALID) {
+
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
@@ -989,72 +1080,104 @@ void crash_kexec(struct pt_regs *regs)
}
}
-size_t crash_get_memory_size(void)
+static inline resource_size_t crash_resource_size(const struct resource *res)
{
- size_t size = 0;
+ return !res->end ? 0 : resource_size(res);
+}
+
+ssize_t crash_get_memory_size(void)
+{
+ ssize_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
+
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
+
+ kexec_unlock();
return size;
}
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
{
- unsigned long addr;
+ struct resource *ram_res;
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
+
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ crashk_res.end = ram_res->start - 1;
+ }
+
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
+ return 0;
}
int crash_shrink_memory(unsigned long new_size)
{
int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
+ unsigned long old_size, low_size;
- mutex_lock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
if (kexec_crash_image) {
ret = -ENOENT;
goto unlock;
}
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
+
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
if (new_size >= old_size) {
ret = (new_size == old_size) ? 0 : -EINVAL;
goto unlock;
}
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
+ /*
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
+ */
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
+
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
}
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1077,47 +1200,13 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
if (!buf)
return;
memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ prstatus.common.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
}
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- size_t size, align;
-
- /*
- * crash_notes could be allocated across 2 vmalloc pages when percpu
- * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
- * pages are also on 2 continuous physical pages. In this case the
- * 2nd part of crash_notes in 2nd page could be lost since only the
- * starting address and size of crash_notes are exported through sysfs.
- * Here round up the size of crash_notes to the nearest power of two
- * and pass it to __alloc_percpu as align value. This can make sure
- * crash_notes is allocated inside one physical page.
- */
- size = sizeof(note_buf_t);
- align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
-
- /*
- * Break compile if size is bigger than PAGE_SIZE since crash_notes
- * definitely will be in 2 pages with that.
- */
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- crash_notes = __alloc_percpu(size, align);
- if (!crash_notes) {
- pr_warn("Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
@@ -1126,7 +1215,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1135,7 +1224,6 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
- lock_system_sleep();
pm_prepare_console();
error = freeze_processes();
if (error) {
@@ -1167,8 +1255,9 @@ int kernel_kexec(void)
#endif
{
kexec_in_progress = true;
- kernel_restart_prepare(NULL);
+ kernel_restart_prepare("kexec reboot");
migrate_to_reboot_cpu();
+ syscore_shutdown();
/*
* migrate_to_reboot_cpu() disables CPU hotplug assuming that
@@ -1181,6 +1270,7 @@ int kernel_kexec(void)
machine_shutdown();
}
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_kexec(kexec_image);
#ifdef CONFIG_KEXEC_JUMP
@@ -1198,24 +1288,10 @@ int kernel_kexec(void)
thaw_processes();
Restore_console:
pm_restore_console();
- unlock_system_sleep();
}
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
-
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_kexec_protect_crashkres(void)
-{}
-
-void __weak arch_kexec_unprotect_crashkres(void)
-{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78df53c6..bef2f6f2571b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -20,16 +20,29 @@
#include <linux/fs.h>
#include <linux/ima.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
+/* Maximum size in bytes for kernel/initrd files. */
+#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX)
+
/*
* Currently this is the only default function that is exported as some
* architectures need it to do additional handlings.
@@ -52,13 +65,6 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
return ret;
}
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_probe_default(image, buf, buf_len);
-}
-
static void *kexec_image_load_default(struct kimage *image)
{
if (!image->fops || !image->fops->load)
@@ -70,11 +76,6 @@ static void *kexec_image_load_default(struct kimage *image)
image->cmdline_buf_len);
}
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return kexec_image_load_default(image);
-}
-
int kexec_image_post_load_cleanup_default(struct kimage *image)
{
if (!image->fops || !image->fops->cleanup)
@@ -83,64 +84,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- return kexec_image_post_load_cleanup_default(image);
-}
-
-#ifdef CONFIG_KEXEC_SIG
-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- if (!image->fops || !image->fops->verify_sig) {
- pr_debug("kernel loader does not support signature verification.\n");
- return -EKEYREJECTED;
- }
-
- return image->fops->verify_sig(buf, buf_len);
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_verify_sig_default(image, buf, buf_len);
-}
-#endif
-
-/*
- * arch_kexec_apply_relocations_add - apply relocations of type RELA
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELAs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * arch_kexec_apply_relocations - apply relocations of type REL
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
/*
* Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
@@ -165,6 +108,11 @@ void kimage_file_post_load_cleanup(struct kimage *image)
vfree(pi->sechdrs);
pi->sechdrs = NULL;
+#ifdef CONFIG_IMA_KEXEC
+ vfree(image->ima_buffer);
+ image->ima_buffer = NULL;
+#endif /* CONFIG_IMA_KEXEC */
+
/* See if architecture has anything to cleanup post load */
arch_kimage_file_post_load_cleanup(image);
@@ -175,19 +123,49 @@ void kimage_file_post_load_cleanup(struct kimage *image)
*/
kfree(image->image_loader_data);
image->image_loader_data = NULL;
+
+ kexec_file_dbg_print = false;
}
#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
+}
+#endif
+
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
static int
kimage_validate_signature(struct kimage *image)
{
int ret;
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
+ ret = kexec_image_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
if (ret) {
- if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ if (sig_enforce) {
pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
@@ -217,15 +195,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
const char __user *cmdline_ptr,
unsigned long cmdline_len, unsigned flags)
{
- int ret;
+ ssize_t ret;
void *ldata;
- loff_t size;
- ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf,
- &size, INT_MAX, READING_KEXEC_IMAGE);
- if (ret)
+ ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
+ KEXEC_FILE_SIZE_MAX, NULL,
+ READING_KEXEC_IMAGE);
+ if (ret < 0)
return ret;
- image->kernel_buf_len = size;
+ image->kernel_buf_len = ret;
+ kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -241,12 +221,13 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
#endif
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf,
- &size, INT_MAX,
+ ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
+ KEXEC_FILE_SIZE_MAX, NULL,
READING_KEXEC_INITRAMFS);
- if (ret)
+ if (ret < 0)
goto out;
- image->initrd_buf_len = size;
+ image->initrd_buf_len = ret;
+ ret = 0;
}
if (cmdline_len) {
@@ -265,15 +246,15 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
goto out;
}
- ima_kexec_cmdline(image->cmdline_buf,
+ ima_kexec_cmdline(kernel_fd, image->cmdline_buf,
image->cmdline_buf_len - 1);
}
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
+ /* Call image load handler */
+ ldata = kexec_image_load_default(image);
if (IS_ERR(ldata)) {
ret = PTR_ERR(ldata);
@@ -301,6 +282,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!image)
return -ENOMEM;
+ kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
if (kexec_on_panic) {
@@ -349,11 +331,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
{
- int ret = 0, i;
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
struct kimage **dest_image, *image;
+ int ret = 0, i;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Make sure we have a legal set of flags */
@@ -362,14 +346,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH) {
+ if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
}
if (flags & KEXEC_FILE_UNLOAD)
@@ -404,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
for (i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
+ kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
ret = kimage_load_segment(image, &image->segment[i]);
if (ret)
@@ -423,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
@@ -434,7 +422,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
@@ -446,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
unsigned long temp_start, temp_end;
temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
+ temp_start = temp_end - kbuf->memsz + 1;
do {
/* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
+ temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
if (temp_start < start || temp_start < kbuf->buf_min)
return 0;
@@ -520,7 +508,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
/* Returning 0 will take to next memory range */
/* Don't use memory that will be detected and handled by a driver. */
- if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
return 0;
if (sz < kbuf->memsz)
@@ -550,6 +538,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+ /*
+ * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+ * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+ * locate_mem_hole_callback().
+ */
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&mstart, &mend, NULL) {
@@ -607,6 +600,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
+ else if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
@@ -639,7 +634,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
- * This function assumes that kexec_mutex is held.
+ * This function assumes that kexec_lock is held.
* On successful return, @kbuf->mem will have the physical address of
* the buffer in memory.
*
@@ -647,7 +642,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
*/
int kexec_add_buffer(struct kexec_buf *kbuf)
{
-
struct kexec_segment *ksegment;
int ret;
@@ -675,7 +669,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = kexec_locate_mem_hole(kbuf);
+ ret = arch_kexec_locate_mem_hole(kbuf);
if (ret)
return ret;
@@ -701,7 +695,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
- if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
return 0;
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -722,8 +716,10 @@ static int kexec_calculate_store_digests(struct kimage *image)
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
+ if (!sha_regions) {
+ ret = -ENOMEM;
goto out_free_desc;
+ }
desc->tfm = tfm;
@@ -740,6 +736,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
+#ifdef CONFIG_CRASH_HOTPLUG
+ /* Exclude elfcorehdr segment to allow future changes via hotplug */
+ if (j == image->elfcorehdr_index)
+ continue;
+#endif
+
ksegment = &image->segment[i];
/*
* Skip purgatory as it will be modified once we put digest
@@ -804,7 +806,7 @@ out:
return ret;
}
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
@@ -881,6 +883,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
{
unsigned long bss_addr;
unsigned long offset;
+ size_t sechdrs_size;
Elf_Shdr *sechdrs;
int i;
@@ -888,11 +891,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* The section headers in kexec_purgatory are read-only. In order to
* have them modifiable make a temporary copy.
*/
- sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum));
+ sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum);
+ sechdrs = vzalloc(sechdrs_size);
if (!sechdrs)
return -ENOMEM;
- memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
- pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size);
pi->sechdrs = sechdrs;
offset = 0;
@@ -915,10 +918,22 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
}
offset = ALIGN(offset, align);
+
+ /*
+ * Check if the segment contains the entry point, if so,
+ * calculate the value of image->start based on it.
+ * If the compiler has produced more than one .text section
+ * (Eg: .text.hot), they are generally after the main .text
+ * section, and they shall not be used to calculate
+ * image->start. So do not re-calculate image->start if it
+ * is not set to the initial value, and warn the user so they
+ * have a chance to fix their purgatory's linker script.
+ */
if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
- + sechdrs[i].sh_size)) {
+ + sechdrs[i].sh_size) &&
+ !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
}
@@ -1151,174 +1166,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
-
-int crash_exclude_mem_range(struct crash_mem *mem,
- unsigned long long mstart, unsigned long long mend)
-{
- int i, j;
- unsigned long long start, end;
- struct crash_mem_range temp_range = {0, 0};
-
- for (i = 0; i < mem->nr_ranges; i++) {
- start = mem->ranges[i].start;
- end = mem->ranges[i].end;
-
- if (mstart > end || mend < start)
- continue;
-
- /* Truncate any area outside of range */
- if (mstart < start)
- mstart = start;
- if (mend > end)
- mend = end;
-
- /* Found completely overlapping range */
- if (mstart == start && mend == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
- }
- mem->nr_ranges--;
- return 0;
- }
-
- if (mstart > start && mend < end) {
- /* Split original range */
- mem->ranges[i].end = mstart - 1;
- temp_range.start = mend + 1;
- temp_range.end = end;
- } else if (mstart != start)
- mem->ranges[i].end = mstart - 1;
- else
- mem->ranges[i].start = mend + 1;
- break;
- }
-
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
- return 0;
-}
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
- void **addr, unsigned long *sz)
-{
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
- unsigned char *buf;
- unsigned int cpu, i;
- unsigned long long notes_addr;
- unsigned long mstart, mend;
-
- /* extra phdr for vmcoreinfo elf note */
- nr_phdr = nr_cpus + 1;
- nr_phdr += mem->nr_ranges;
-
- /*
- * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
- * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
- * I think this is required by tools like gdb. So same physical
- * memory will be mapped in two elf headers. One will contain kernel
- * text virtual addresses and other will have __va(physical) addresses.
- */
-
- nr_phdr++;
- elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
- elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
- buf = vzalloc(elf_sz);
- if (!buf)
- return -ENOMEM;
-
- ehdr = (Elf64_Ehdr *)buf;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
- ehdr->e_ident[EI_CLASS] = ELFCLASS64;
- ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
- ehdr->e_ident[EI_VERSION] = EV_CURRENT;
- ehdr->e_ident[EI_OSABI] = ELF_OSABI;
- memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
- ehdr->e_type = ET_CORE;
- ehdr->e_machine = ELF_ARCH;
- ehdr->e_version = EV_CURRENT;
- ehdr->e_phoff = sizeof(Elf64_Ehdr);
- ehdr->e_ehsize = sizeof(Elf64_Ehdr);
- ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
- /* Prepare one phdr of type PT_NOTE for each present cpu */
- for_each_present_cpu(cpu) {
- phdr->p_type = PT_NOTE;
- notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
- phdr->p_offset = phdr->p_paddr = notes_addr;
- phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
- (ehdr->e_phnum)++;
- phdr++;
- }
-
- /* Prepare one PT_NOTE header for vmcoreinfo */
- phdr->p_type = PT_NOTE;
- phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
- phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
- (ehdr->e_phnum)++;
- phdr++;
-
- /* Prepare PT_LOAD type program header for kernel text region */
- if (kernel_map) {
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (unsigned long) _text;
- phdr->p_filesz = phdr->p_memsz = _end - _text;
- phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
- ehdr->e_phnum++;
- phdr++;
- }
-
- /* Go through all the ranges in mem->ranges[] and prepare phdr */
- for (i = 0; i < mem->nr_ranges; i++) {
- mstart = mem->ranges[i].start;
- mend = mem->ranges[i].end;
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mstart;
-
- phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long) __va(mstart);
- phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
- phdr->p_align = 0;
- ehdr->e_phnum++;
- phdr++;
- pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
- }
-
- *addr = buf;
- *sz = elf_sz;
- return 0;
-}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 39d30ccf8d87..74da1409cd14 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,9 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-int machine_kexec_post_load(struct kimage *image);
-
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 8f69772af77b..42163c9e94e5 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -26,15 +26,15 @@ asm (
" .popsection \n"
);
-extern char kernel_headers_data;
-extern char kernel_headers_data_end;
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
static ssize_t
ikheaders_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
- memcpy(buf, &kernel_headers_data + off, len);
+ memcpy(buf, &kernel_headers_data[off], len);
return len;
}
@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = {
static int __init ikheaders_init(void)
{
- kheaders_attr.size = (&kernel_headers_data_end -
- &kernel_headers_data);
+ kheaders_attr.size = (kernel_headers_data_end -
+ kernel_headers_data);
return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2e97febeef77..9d9095e81792 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
- * kernel/kprobes.c
*
* Copyright (C) IBM Corporation, 2002, 2004
*
@@ -18,6 +17,9 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
+
+#define pr_fmt(fmt) "kprobes: " fmt
+
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
@@ -35,6 +37,8 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/static_call.h>
+#include <linux/perf_event.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -44,25 +48,24 @@
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL)
+#define kprobe_sysctls_init() do { } while (0)
+#endif
static int kprobes_initialized;
/* kprobe_table can be accessed by
- * - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
+ * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held.
* Or
* - RCU hlist traversal under disabling preempt (breakpoint handlers)
*/
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: change this value only with 'kprobe_mutex' held */
static bool kprobes_all_disarmed;
-/* This protects kprobe_table and optimizing_list */
+/* This protects 'kprobe_table' and 'optimizing_list' */
static DEFINE_MUTEX(kprobe_mutex);
-static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
-static struct {
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
-} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance);
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
unsigned int __unused)
@@ -70,17 +73,15 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
-static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
-{
- return &(kretprobe_table_locks[hash].lock);
-}
-
-/* Blacklist -- list of struct kprobe_blacklist_entry */
+/*
+ * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where
+ * kprobes can not probe.
+ */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
- * kprobe->ainsn.insn points to the copy of the instruction to be
+ * 'kprobe::ainsn.insn' points to the copy of the instruction to be
* single-stepped. x86_64, POWER4 and above have no-exec support and
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
@@ -111,10 +112,16 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
+ /*
+ * Use module_alloc() so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * for most of the architectures.
+ * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
+ */
return module_alloc(PAGE_SIZE);
}
-void __weak free_insn_page(void *page)
+static void free_insn_page(void *page)
{
module_memfree(page);
}
@@ -123,6 +130,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
@@ -145,6 +153,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
+
for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
@@ -170,11 +179,6 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
if (!kip)
goto out;
- /*
- * Use module_alloc so this page is within +/- 2GB of where the
- * kernel image and loaded module images reside. This is required
- * so x86_64 can correctly handle the %rip-relative fixups.
- */
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
@@ -188,13 +192,17 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
+
+ /* Record the perf ksymbol register event after adding the page */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+ PAGE_SIZE, false, c->sym);
out:
mutex_unlock(&c->mutex);
return slot;
}
-/* Return 1 if all garbages are collected, otherwise 0. */
-static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
+/* Return true if all garbages are collected, otherwise false. */
+static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -206,14 +214,21 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
+ /*
+ * Record perf ksymbol unregister event before removing
+ * the page.
+ */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
list_del_rcu(&kip->list);
synchronize_rcu();
kip->cache->free(kip->insns);
kfree(kip);
}
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -225,6 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
+
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -295,12 +311,44 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
return ret;
}
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym)
+{
+ struct kprobe_insn_page *kip;
+ int ret = -ERANGE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if ((*symnum)--)
+ continue;
+ strscpy(sym, c->sym, KSYM_NAME_LEN);
+ *type = 't';
+ *value = (unsigned long)kip->insns;
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
+void __weak *alloc_optinsn_page(void)
+{
+ return alloc_insn_page();
+}
+
+void __weak free_optinsn_page(void *page)
+{
+ free_insn_page(page);
+}
+
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
- .alloc = alloc_insn_page,
- .free = free_insn_page,
+ .alloc = alloc_optinsn_page,
+ .free = free_optinsn_page,
+ .sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
@@ -321,9 +369,9 @@ static inline void reset_kprobe_instance(void)
/*
* This routine is called either:
- * - under the kprobe_mutex - during kprobe_[un]register()
- * OR
- * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ * - under the 'kprobe_mutex' - during kprobe_[un]register().
+ * OR
+ * - with preemption disabled - from architecture specific code.
*/
struct kprobe *get_kprobe(void *addr)
{
@@ -343,22 +391,20 @@ NOKPROBE_SYMBOL(get_kprobe);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
-/* Return true if the kprobe is an aggregator */
-static inline int kprobe_aggrprobe(struct kprobe *p)
+/* Return true if 'p' is an aggregator */
+static inline bool kprobe_aggrprobe(struct kprobe *p)
{
return p->pre_handler == aggr_pre_handler;
}
-/* Return true(!0) if the kprobe is unused */
-static inline int kprobe_unused(struct kprobe *p)
+/* Return true if 'p' is unused */
+static inline bool kprobe_unused(struct kprobe *p)
{
return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
list_empty(&p->list);
}
-/*
- * Keep all fields in the kprobe consistent
- */
+/* Keep all fields in the kprobe consistent. */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
@@ -366,11 +412,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
}
#ifdef CONFIG_OPTPROBES
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: This is protected by 'kprobe_mutex'. */
static bool kprobes_allow_optimization;
/*
- * Call all pre_handler on the list, but ignores its return value.
+ * Call all 'kprobe::pre_handler' on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
@@ -398,7 +444,7 @@ static void free_aggr_kprobe(struct kprobe *p)
kfree(op);
}
-/* Return true(!0) if the kprobe is ready for optimization. */
+/* Return true if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -411,8 +457,8 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
-/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
-static inline int kprobe_disarmed(struct kprobe *p)
+/* Return true if the kprobe is disarmed. Note: p must be on hash list */
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -425,32 +471,32 @@ static inline int kprobe_disarmed(struct kprobe *p)
return kprobe_disabled(p) && list_empty(&op->list);
}
-/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int kprobe_queued(struct kprobe *p)
+/* Return true if the probe is queued on (un)optimizing lists */
+static bool kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
* Return an optimized kprobe whose optimizing code replaces
- * instructions including addr (exclude breakpoint).
+ * instructions including 'addr' (exclude breakpoint).
*/
-static struct kprobe *get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr)
{
int i;
struct kprobe *p = NULL;
struct optimized_kprobe *op;
/* Don't check i == 0, since that is a breakpoint case. */
- for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
- p = get_kprobe((void *)(addr - i));
+ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++)
+ p = get_kprobe(addr - i);
if (p && kprobe_optready(p)) {
op = container_of(p, struct optimized_kprobe, kp);
@@ -461,7 +507,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr)
return NULL;
}
-/* Optimization staging list, protected by kprobe_mutex */
+/* Optimization staging list, protected by 'kprobe_mutex' */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
@@ -472,20 +518,20 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
- * optimizing_list.
+ * 'optimizing_list'.
*/
static void do_optimize_kprobes(void)
{
lockdep_assert_held(&text_mutex);
/*
- * The optimization/unoptimization refers online_cpus via
- * stop_machine() and cpu-hotplug modifies online_cpus.
- * And same time, text_mutex will be held in cpu-hotplug and here.
- * This combination can cause a deadlock (cpu-hotplug try to lock
- * text_mutex but stop_machine can not be done because online_cpus
- * has been changed)
- * To avoid this deadlock, caller must have locked cpu hotplug
- * for preventing cpu-hotplug outside of text_mutex locking.
+ * The optimization/unoptimization refers 'online_cpus' via
+ * stop_machine() and cpu-hotplug modifies the 'online_cpus'.
+ * And same time, 'text_mutex' will be held in cpu-hotplug and here.
+ * This combination can cause a deadlock (cpu-hotplug tries to lock
+ * 'text_mutex' but stop_machine() can not be done because
+ * the 'online_cpus' has been changed)
+ * To avoid this deadlock, caller must have locked cpu-hotplug
+ * for preventing cpu-hotplug outside of 'text_mutex' locking.
*/
lockdep_assert_cpus_held();
@@ -499,7 +545,7 @@ static void do_optimize_kprobes(void)
/*
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
- * if need) kprobes listed on unoptimizing_list.
+ * if need) kprobes listed on 'unoptimizing_list'.
*/
static void do_unoptimize_kprobes(void)
{
@@ -509,23 +555,21 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop free_list for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
* Remove unused probes from hash list. After waiting
* for synchronization, these probes are reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes.)
+ * (reclaiming is done by do_free_cleaned_kprobes().)
*/
hlist_del_rcu(&op->kp.hlist);
} else
@@ -533,7 +577,7 @@ static void do_unoptimize_kprobes(void)
}
}
-/* Reclaim all kprobes on the free_list */
+/* Reclaim all kprobes on the 'freeing_list' */
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -563,8 +607,6 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_lock(&kprobe_mutex);
cpus_read_lock();
mutex_lock(&text_mutex);
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -589,7 +631,6 @@ static void kprobe_optimizer(struct work_struct *work)
/* Step 4: Free cleaned kprobes after quiesence period */
do_free_cleaned_kprobes();
- mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
@@ -608,9 +649,9 @@ void wait_for_kprobe_optimizer(void)
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
- /* this will also make optimizing_work execute immmediately */
+ /* This will also make 'optimizing_work' execute immmediately */
flush_delayed_work(&optimizing_work);
- /* @optimizing_work might not have been queued yet, relax */
+ /* 'optimizing_work' might not have been queued yet, relax */
cpu_relax();
mutex_lock(&kprobe_mutex);
@@ -619,7 +660,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -641,7 +682,7 @@ static void optimize_kprobe(struct kprobe *p)
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
- /* kprobes with post_handler can not be optimized */
+ /* kprobes with 'post_handler' can not be optimized */
if (p->post_handler)
return;
@@ -661,7 +702,10 @@ static void optimize_kprobe(struct kprobe *p)
}
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ /*
+ * On the 'unoptimizing_list' and 'optimizing_list',
+ * 'op' must have OPTIMIZED flag
+ */
if (WARN_ON_ONCE(!list_empty(&op->list)))
return;
@@ -731,7 +775,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
WARN_ON_ONCE(list_empty(&op->list));
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
- /* Optimize it again (remove from op->list) */
+ /* Optimize it again. (remove from 'op->list') */
if (!kprobe_optready(ap))
return -EINVAL;
@@ -751,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
@@ -781,7 +824,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
__prepare_optimized_kprobe(op, p);
}
-/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+/* Allocate new optimized_kprobe and try to prepare optimized instructions. */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -800,19 +843,19 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
- * Prepare an optimized_kprobe and optimize it
- * NOTE: p must be a normal registered kprobe
+ * Prepare an optimized_kprobe and optimize it.
+ * NOTE: 'p' must be a normal registered kprobe.
*/
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
- /* Impossible to optimize ftrace-based kprobe */
+ /* Impossible to optimize ftrace-based kprobe. */
if (kprobe_ftrace(p))
return;
- /* For preparing optimization, jump_label_text_reserved() is called */
+ /* For preparing optimization, jump_label_text_reserved() is called. */
cpus_read_lock();
jump_label_lock();
mutex_lock(&text_mutex);
@@ -823,14 +866,14 @@ static void try_to_optimize_kprobe(struct kprobe *p)
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
- /* If failed to setup optimizing, fallback to kprobe */
+ /* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
goto out;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap); /* This just kicks optimizer thread */
+ optimize_kprobe(ap); /* This just kicks optimizer thread. */
out:
mutex_unlock(&text_mutex);
@@ -838,7 +881,6 @@ out:
cpus_read_unlock();
}
-#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -846,7 +888,7 @@ static void optimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already allowed, just return */
+ /* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
goto out;
@@ -859,11 +901,12 @@ static void optimize_all_kprobes(void)
optimize_kprobe(p);
}
cpus_read_unlock();
- printk(KERN_INFO "Kprobes globally optimized\n");
+ pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
out:
mutex_unlock(&kprobe_mutex);
}
+#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -871,7 +914,7 @@ static void unoptimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already prohibited, just return */
+ /* If optimization is already prohibited, just return. */
if (!kprobes_allow_optimization) {
mutex_unlock(&kprobe_mutex);
return;
@@ -889,16 +932,16 @@ static void unoptimize_all_kprobes(void)
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
- /* Wait for unoptimizing completion */
+ /* Wait for unoptimizing completion. */
wait_for_kprobe_optimizer();
- printk(KERN_INFO "Kprobes globally unoptimized\n");
+ pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
-int sysctl_kprobes_optimization;
-int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length,
- loff_t *ppos)
+static int sysctl_kprobes_optimization;
+static int proc_kprobes_optimization_handler(struct ctl_table *table,
+ int write, void *buffer,
+ size_t *length, loff_t *ppos)
{
int ret;
@@ -914,15 +957,35 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
return ret;
}
+
+static struct ctl_table kprobe_sysctls[] = {
+ {
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init kprobe_sysctls_init(void)
+{
+ register_sysctl_init("debug", kprobe_sysctls);
+}
#endif /* CONFIG_SYSCTL */
-/* Put a breakpoint for a probe. Must be called with text_mutex locked */
+/* Put a breakpoint for a probe. */
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
- /* Check collision with other optimized kprobes */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ lockdep_assert_held(&text_mutex);
+
+ /* Find the overlapping optimized kprobes. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p))
/* Fallback to unoptimized kprobe */
unoptimize_kprobe(_p, true);
@@ -931,22 +994,29 @@ static void __arm_kprobe(struct kprobe *p)
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+/* Remove the breakpoint of a probe. */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
+ lockdep_assert_held(&text_mutex);
+
/* Try to unoptimize */
unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
- /* If another kprobe was blocked, optimize it. */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ /* If another kprobe was blocked, re-optimize it. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p) && reopt)
optimize_kprobe(_p);
}
- /* TODO: reoptimize others after unoptimized this probe */
+ /*
+ * TODO: Since unoptimization and real disarming will be done by
+ * the worker thread, we can not check whether another probe are
+ * unoptimized because of this probe here. It should be re-optimized
+ * by the worker thread.
+ */
}
#else /* !CONFIG_OPTPROBES */
@@ -969,7 +1039,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
* unregistered.
* Thus there should be no chance to reuse unused kprobe.
*/
- printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -999,34 +1069,21 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
-/* Must ensure p->addr is really on ftrace */
-static int prepare_kprobe(struct kprobe *p)
-{
- if (!kprobe_ftrace(p))
- return arch_prepare_kprobe(p);
-
- return arch_prepare_kprobe_ftrace(p);
-}
-
-/* Caller must lock kprobe_mutex */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
+
+ lockdep_assert_held(&kprobe_mutex);
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (ret) {
- pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
- p->addr, ret);
+ if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
return ret;
- }
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (ret) {
- pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret))
goto err_ftrace;
- }
}
(*cnt)++;
@@ -1050,22 +1107,23 @@ static int arm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
-/* Caller must lock kprobe_mutex */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
- int ret = 0;
+ int ret;
+
+ lockdep_assert_held(&kprobe_mutex);
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
+ if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret))
return ret;
}
(*cnt)--;
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
- WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
+ WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n",
p->addr, ret);
return ret;
}
@@ -1079,12 +1137,26 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
-/* Arm a kprobe with text_mutex */
+static int prepare_kprobe(struct kprobe *p)
+{
+ /* Must ensure p->addr is really on ftrace */
+ if (kprobe_ftrace(p))
+ return arch_prepare_kprobe_ftrace(p);
+
+ return arch_prepare_kprobe(p);
+}
+
static int arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1099,7 +1171,6 @@ static int arm_kprobe(struct kprobe *kp)
return 0;
}
-/* Disarm a kprobe with text_mutex */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1149,99 +1220,21 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(aggr_post_handler);
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
-{
- struct kprobe *cur = __this_cpu_read(kprobe_instance);
-
- /*
- * if we faulted "during" the execution of a user specified
- * probe handler, invoke just that probe's fault handler
- */
- if (cur && cur->fault_handler) {
- if (cur->fault_handler(cur, regs, trapnr))
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(aggr_fault_handler);
-
-/* Walks the list and increments nmissed count for multiprobe case */
+/* Walks the list and increments 'nmissed' if 'p' has child probes. */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
+
if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
kp->nmissed++;
}
- return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
-{
- struct kretprobe *rp = ri->rp;
-
- /* remove rp inst off the rprobe_inst_table */
- hlist_del(&ri->hlist);
- INIT_HLIST_NODE(&ri->hlist);
- if (likely(rp)) {
- raw_spin_lock(&rp->lock);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock(&rp->lock);
- } else
- /* Unregistering */
- hlist_add_head(&ri->hlist, head);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
-void kretprobe_hash_lock(struct task_struct *tsk,
- struct hlist_head **head, unsigned long *flags)
-__acquires(hlist_lock)
-{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
-
- *head = &kretprobe_inst_table[hash];
- hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_lock_irqsave(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_hash_lock);
-
-static void kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
-__acquires(hlist_lock)
-{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_lock_irqsave(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_table_lock);
-
-void kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
-__releases(hlist_lock)
-{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
-
- hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_hash_unlock);
-
-static void kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
-__releases(hlist_lock)
-{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_table_unlock);
-
-struct kprobe kprobe_busy = {
+static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
};
@@ -1261,76 +1254,7 @@ void kprobe_busy_end(void)
preempt_enable();
}
-/*
- * This function is called from finish_task_switch when task tk becomes dead,
- * so that we can recycle any function-return probe instances associated
- * with this task. These left over instances represent probed functions
- * that have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long hash, flags = 0;
-
- if (unlikely(!kprobes_initialized))
- /* Early boot. kretprobe_table_locks not yet initialized. */
- return;
-
- kprobe_busy_begin();
-
- INIT_HLIST_HEAD(&empty_rp);
- hash = hash_ptr(tk, KPROBE_HASH_BITS);
- head = &kretprobe_inst_table[hash];
- kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task == tk)
- recycle_rp_inst(ri, &empty_rp);
- }
- kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
-
- kprobe_busy_end();
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
- struct kretprobe_instance *ri;
- struct hlist_node *next;
-
- hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
-}
-
-static void cleanup_rp_inst(struct kretprobe *rp)
-{
- unsigned long flags, hash;
- struct kretprobe_instance *ri;
- struct hlist_node *next;
- struct hlist_head *head;
-
- /* No race here */
- for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
- kretprobe_table_lock(hash, &flags);
- head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, next, head, hlist) {
- if (ri->rp == rp)
- ri->rp = NULL;
- }
- kretprobe_table_unlock(hash, &flags);
- }
- free_rp_inst(rp);
-}
-NOKPROBE_SYMBOL(cleanup_rp_inst);
-
-/* Add the new probe to ap->list */
+/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
if (p->post_handler)
@@ -1344,18 +1268,17 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * Fill in the required fields of the "manager kprobe". Replace the
- * earlier kprobe in the hlist with the manager kprobe
+ * Fill in the required fields of the aggregator kprobe. Replace the
+ * earlier kprobe in the hlist with the aggregator kprobe.
*/
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
- /* Copy p's insn slot to ap */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
ap->pre_handler = aggr_pre_handler;
- ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
if (p->post_handler && !kprobe_gone(p))
ap->post_handler = aggr_post_handler;
@@ -1368,8 +1291,7 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * This is the second or subsequent kprobe at the address - handle
- * the intricacies
+ * This registers the second or subsequent kprobe at the same address.
*/
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
@@ -1383,7 +1305,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
mutex_lock(&text_mutex);
if (!kprobe_aggrprobe(orig_p)) {
- /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
ap = alloc_aggr_kprobe(orig_p);
if (!ap) {
ret = -ENOMEM;
@@ -1408,8 +1330,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
if (ret)
/*
* Even if fail to allocate new slot, don't need to
- * free aggr_probe. It will be used next time, or
- * freed by unregister_kprobe.
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
*/
goto out;
@@ -1424,7 +1346,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
| KPROBE_FLAG_DISABLED;
}
- /* Copy ap's insn slot to p */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(ap, p);
ret = add_new_kprobe(ap, p);
@@ -1450,7 +1372,7 @@ out:
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- /* The __kprobes marked functions and entry code must not be probed */
+ /* The '__kprobes' functions and entry code must not be probed. */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
@@ -1462,8 +1384,8 @@ static bool __within_kprobe_blacklist(unsigned long addr)
if (arch_within_kprobe_blacklist(addr))
return true;
/*
- * If there exists a kprobe_blacklist, verify and
- * fail any probe registration in the prohibited area
+ * If 'kprobe_blacklist' is defined, check the address and
+ * reject any probe registration in the prohibited area.
*/
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
@@ -1493,24 +1415,68 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
- * If we have a symbol_name argument, look it up and add the offset field
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
+ * If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
- const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + offset);
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
if (addr)
return addr;
@@ -1520,10 +1486,14 @@ invalid:
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
- return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+ bool on_func_entry;
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+/*
+ * Check the 'p' is valid and return the aggregator kprobe
+ * at the same address.
+ */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1545,29 +1515,28 @@ valid:
return ap;
}
-/* Return error if the kprobe is being re-registered */
-static inline int check_kprobe_rereg(struct kprobe *p)
+/*
+ * Warn and return error if the kprobe is being re-registered since
+ * there must be a software bug.
+ */
+static inline int warn_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
mutex_lock(&kprobe_mutex);
- if (__get_valid_kprobe(p))
+ if (WARN_ON_ONCE(__get_valid_kprobe(p)))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
return ret;
}
-int __weak arch_check_ftrace_location(struct kprobe *p)
+static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
@@ -1576,27 +1545,42 @@ int __weak arch_check_ftrace_location(struct kprobe *p)
return 0;
}
+static bool is_cfi_preamble_symbol(unsigned long addr)
+{
+ char symbuf[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symbuf))
+ return false;
+
+ return str_has_prefix("__cfi_", symbuf) ||
+ str_has_prefix("__pfx_", symbuf);
+}
+
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
int ret;
- ret = arch_check_ftrace_location(p);
+ ret = check_ftrace_location(p);
if (ret)
return ret;
jump_label_lock();
preempt_disable();
/* Ensure it is not in reserved area nor out of text */
- if (!kernel_text_address((unsigned long) p->addr) ||
+ if (!(core_kernel_text((unsigned long) p->addr) ||
+ is_module_text_address((unsigned long) p->addr)) ||
+ in_gate_area_no_mm((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
- find_bug((unsigned long)p->addr)) {
+ static_call_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr) ||
+ is_cfi_preamble_symbol((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
- /* Check if are we probing a module */
+ /* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
/*
@@ -1609,7 +1593,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
}
/*
- * If the module freed .init.text, we couldn't insert
+ * If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
@@ -1632,14 +1616,15 @@ int register_kprobe(struct kprobe *p)
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
+ bool on_func_entry;
/* Adjust probe address from symbol */
- addr = kprobe_addr(p);
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
- ret = check_kprobe_rereg(p);
+ ret = warn_kprobe_rereg(p);
if (ret)
return ret;
@@ -1654,9 +1639,12 @@ int register_kprobe(struct kprobe *p)
mutex_lock(&kprobe_mutex);
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+
old_p = get_kprobe(p->addr);
if (old_p) {
- /* Since this may unoptimize old_p, locking text_mutex. */
+ /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
ret = register_aggr_kprobe(old_p, p);
goto out;
}
@@ -1695,8 +1683,8 @@ out:
}
EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check if all probes on the aggrprobe are disabled */
-static int aggr_kprobe_disabled(struct kprobe *ap)
+/* Check if all probes on the 'ap' are disabled. */
+static bool aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1705,20 +1693,21 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
- * There is an active probe on the list.
- * We can't disable this ap.
+ * Since there is an active probe on the list,
+ * we can't disable this 'ap'.
*/
- return 0;
+ return false;
- return 1;
+ return true;
}
-/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
int ret;
+ lockdep_assert_held(&kprobe_mutex);
+
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
@@ -1732,11 +1721,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
/*
- * If kprobes_all_disarmed is set, orig_p
- * should have already been disarmed, so
- * skip unneed disarming process.
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
*/
- if (!kprobes_all_disarmed) {
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
ret = disarm_kprobe(orig_p, true);
if (ret) {
p->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1785,7 +1775,13 @@ static int __unregister_kprobe_top(struct kprobe *p)
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- ap->post_handler = NULL;
+ /*
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
+ */
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
}
noclean:
/*
@@ -1878,12 +1874,215 @@ static struct notifier_block kprobe_exceptions_nb = {
.priority = 0x7fffffff /* we need to be notified first */
};
-unsigned long __weak arch_deref_entry_point(void *entry)
+#ifdef CONFIG_KRETPROBES
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+ struct kretprobe_instance *ri = nod;
+
+ ri->rph = context;
+ return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
{
- return (unsigned long)entry;
+ kfree(context);
+ return 0;
}
-#ifdef CONFIG_KRETPROBES
+static void free_rp_inst_rcu(struct rcu_head *head)
+{
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;
+
+ objpool_drop(ri, &rph->pool);
+}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
+
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+
+ if (likely(rp))
+ objpool_push(ri, &rp->rph->pool);
+ else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct llist_node *node;
+
+ /* Early boot, not yet initialized. */
+ if (unlikely(!kprobes_initialized))
+ return;
+
+ kprobe_busy_begin();
+
+ node = __llist_del_all(&tk->kretprobe_instances);
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_holder *rph = rp->rph;
+
+ if (!rph)
+ return;
+ rp->rph = NULL;
+ objpool_fini(&rph->pool);
+}
+
+/* This assumes the 'tsk' is the current task or the is not running. */
+static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->kretprobe_instances.first;
+ else
+ node = node->next;
+
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ if (ri->ret_addr != kretprobe_trampoline_addr()) {
+ *cur = node;
+ return ri->ret_addr;
+ }
+ node = node->next;
+ }
+ return NULL;
+}
+NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
+
+/**
+ * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe
+ * @tsk: Target task
+ * @fp: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a kretprobe on @tsk in unsigned
+ * long type. If it finds the return address, this returns that address value,
+ * or this returns 0.
+ * The @tsk must be 'current' or a task which is not running. @fp is a hint
+ * to get the currect return address - which is compared with the
+ * kretprobe_instance::fp field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ */
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri;
+ kprobe_opcode_t *ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ do {
+ ret = __kretprobe_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ ri = container_of(*cur, struct kretprobe_instance, llist);
+ } while (ri->fp != fp);
+
+ return (unsigned long)ret;
+}
+NOKPROBE_SYMBOL(kretprobe_find_ret_addr);
+
+void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
+{
+ /*
+ * Do nothing by default. Please fill this to update the fake return
+ * address on the stack with the correct one on each arch if possible.
+ */
+}
+
+unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
+ void *frame_pointer)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *first, *node = NULL;
+ kprobe_opcode_t *correct_ret_addr;
+ struct kretprobe *rp;
+
+ /* Find correct address and all nodes for this frame. */
+ correct_ret_addr = __kretprobe_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n");
+ BUG_ON(1);
+ }
+
+ /*
+ * Set the return address as the instruction pointer, because if the
+ * user handler calls stack_trace_save_regs() with this 'regs',
+ * the stack trace will start from the instruction pointer.
+ */
+ instruction_pointer_set(regs, (unsigned long)correct_ret_addr);
+
+ /* Run the user handler of the nodes. */
+ first = current->kretprobe_instances.first;
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+
+ if (WARN_ON_ONCE(ri->fp != frame_pointer))
+ break;
+
+ rp = get_kretprobe(ri);
+ if (rp && rp->handler) {
+ struct kprobe *prev = kprobe_running();
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ ri->ret_addr = correct_ret_addr;
+ rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, prev);
+ }
+ if (first == node)
+ break;
+
+ first = first->next;
+ }
+
+ arch_kretprobe_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink all nodes for this frame. */
+ first = current->kretprobe_instances.first;
+ current->kretprobe_instances.first = node->next;
+ node->next = NULL;
+
+ /* Recycle free instances. */
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+ first = first->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ return (unsigned long)correct_ret_addr;
+}
+NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
+
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -1891,81 +2090,119 @@ unsigned long __weak arch_deref_entry_point(void *entry)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
- unsigned long hash, flags = 0;
+ struct kretprobe_holder *rph = rp->rph;
struct kretprobe_instance *ri;
- /*
- * To avoid deadlocks, prohibit return probing in NMI contexts,
- * just skip the probe and increase the (inexact) 'nmissed'
- * statistical counter, so that the user is informed that
- * something happened:
- */
- if (unlikely(in_nmi())) {
+ ri = objpool_pop(&rph->pool);
+ if (!ri) {
rp->nmissed++;
return 0;
}
- /* TODO: consider to only swap the RA after the last pre_handler fired */
- hash = hash_ptr(current, KPROBE_HASH_BITS);
- raw_spin_lock_irqsave(&rp->lock, flags);
- if (!hlist_empty(&rp->free_instances)) {
- ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, hlist);
- hlist_del(&ri->hlist);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ objpool_push(ri, &rph->pool);
+ return 0;
+ }
- ri->rp = rp;
- ri->task = current;
+ arch_prepare_kretprobe(ri, regs);
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- raw_spin_lock_irqsave(&rp->lock, flags);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
- return 0;
- }
+ __llist_add(&ri->llist, &current->kretprobe_instances);
- arch_prepare_kretprobe(ri, regs);
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
- /* XXX(hch): why is there no hlist_move_head? */
- INIT_HLIST_NODE(&ri->hlist);
- kretprobe_table_lock(hash, &flags);
- hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
- kretprobe_table_unlock(hash, &flags);
- } else {
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
rp->nmissed++;
- raw_spin_unlock_irqrestore(&rp->lock, flags);
+ return 0;
}
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ unsigned long ret_addr,
+ struct pt_regs *regs)
{
- return !offset;
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data) || !rp->handler)
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
-bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym: Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
- kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
if (IS_ERR(kp_addr))
- return false;
+ return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_kprobe_on_func_entry(offset))
- return false;
+ if (!on_func_entry)
+ return -EINVAL;
- return true;
+ return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
- int ret = 0;
- struct kretprobe_instance *inst;
+ int ret;
int i;
void *addr;
- if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+ if (ret)
+ return ret;
+
+ /* If only 'rp->kp.addr' is specified, check reregistering kprobes */
+ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
if (kretprobe_blacklist_size) {
@@ -1979,36 +2216,49 @@ int register_kretprobe(struct kretprobe *rp)
}
}
+ if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+ return -E2BIG;
+
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
- rp->kp.fault_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
- }
- raw_spin_lock_init(&rp->lock);
- INIT_HLIST_HEAD(&rp->free_instances);
- for (i = 0; i < rp->maxactive; i++) {
- inst = kmalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
- if (inst == NULL) {
- free_rp_inst(rp);
- return -ENOMEM;
- }
- INIT_HLIST_NODE(&inst->hlist);
- hlist_add_head(&inst->hlist, &rp->free_instances);
+
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+ sizeof(struct kretprobe_instance) +
+ rp->data_size, rp->maxactive);
+ if (IS_ERR(rp->rh))
+ return PTR_ERR(rp->rh);
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
}
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
+ rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
+ if (!rp->rph)
+ return -ENOMEM;
+ if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
+ }
+ rcu_assign_pointer(rp->rph->rp, rp);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2044,16 +2294,24 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
if (num <= 0)
return;
mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
+ for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
+ rcu_assign_pointer(rps[i]->rph->rp, NULL);
+#endif
+ }
mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
- cleanup_rp_inst(rps[i]);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
+ free_rp_inst(rps[i]);
+#endif
}
}
}
@@ -2062,13 +2320,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int register_kretprobe(struct kretprobe *rp)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
@@ -2097,6 +2355,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2160,8 +2426,11 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
- if (ret)
+ if (ret) {
p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
out:
mutex_unlock(&kprobe_mutex);
@@ -2172,8 +2441,7 @@ EXPORT_SYMBOL_GPL(enable_kprobe);
/* Caller must NOT call this in usual path. This is only for critical case */
void dump_kprobe(struct kprobe *kp)
{
- pr_err("Dumping kprobe:\n");
- pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
+ pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n",
kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);
@@ -2232,6 +2500,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
kprobe_remove_area_blacklist(entry, entry + 1);
}
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+ if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+ return 0;
+#ifdef CONFIG_OPTPROBES
+ if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+ return 0;
+#endif
+#endif
+ if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+ return 0;
+ return -ERANGE;
+}
+
int __init __weak arch_populate_kprobe_blacklist(void)
{
return 0;
@@ -2253,7 +2543,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
int ret;
for (iter = start; iter < end; iter++) {
- entry = arch_deref_entry_point((void *)*iter);
+ entry = (unsigned long)dereference_symbol_descriptor((void *)*iter);
ret = kprobe_add_ksym_blacklist(entry);
if (ret == -EINVAL)
continue;
@@ -2261,13 +2551,13 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret;
}
- /* Symbols in __kprobes_text are blacklisted */
+ /* Symbols in '__kprobes_text' are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
(unsigned long)__kprobes_text_end);
if (ret)
return ret;
- /* Symbols in noinstr section are blacklisted */
+ /* Symbols in 'noinstr' section are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
(unsigned long)__noinstr_text_end);
@@ -2339,9 +2629,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/*
- * When MODULE_STATE_GOING was notified, both of module .text and
- * .init.text sections would be freed. When MODULE_STATE_LIVE was
- * notified, only .init.text section would be freed. We need to
+ * When 'MODULE_STATE_GOING' was notified, both of module '.text' and
+ * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was
+ * notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
mutex_lock(&kprobe_mutex);
@@ -2358,9 +2648,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
*
* Note, this will also move any optimized probes
* that are pending to be removed from their
- * corresponding lists to the freeing_list and
+ * corresponding lists to the 'freeing_list' and
* will not be touched by the delayed
- * kprobe_optimizer work handler.
+ * kprobe_optimizer() work handler.
*/
kill_kprobe(p);
}
@@ -2376,28 +2666,41 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
-/* Markers of _kprobe_blacklist section */
-extern unsigned long __start_kprobe_blacklist[];
-extern unsigned long __stop_kprobe_blacklist[];
+void kprobe_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+ struct hlist_head *head;
+ struct kprobe *p;
+ int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Kill all kprobes on initmem because the target code has been freed. */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry(p, head, hlist) {
+ if (start <= (void *)p->addr && (void *)p->addr < end)
+ kill_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+}
static int __init init_kprobes(void)
{
- int i, err = 0;
+ int i, err;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
- for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kprobe_table[i]);
- INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
- raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
- }
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
- if (err) {
- pr_err("kprobes: failed to populate blacklist: %d\n", err);
- pr_err("Please take care of using kprobes.\n");
- }
+ if (err)
+ pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err);
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
@@ -2405,23 +2708,19 @@ static int __init init_kprobes(void)
kretprobe_blacklist[i].addr =
kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
- printk("kretprobe: lookup failed: %s\n",
+ pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n",
kretprobe_blacklist[i].name);
}
}
-#if defined(CONFIG_OPTPROBES)
-#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots */
- kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
-#endif
- /* By default, kprobes can be optimized */
- kprobes_allow_optimization = true;
-#endif
-
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init 'kprobe_optinsn_slots' for allocation */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -2429,12 +2728,25 @@ static int __init init_kprobes(void)
err = register_module_notifier(&kprobe_module_nb);
kprobes_initialized = (err == 0);
-
- if (!err)
- init_test_probes();
+ kprobe_sysctls_init();
return err;
}
-subsys_initcall(init_kprobes);
+early_initcall(init_kprobes);
+
+#if defined(CONFIG_OPTPROBES)
+static int __init init_optprobes(void)
+{
+ /*
+ * Enable kprobe optimization - this kicks the optimizer which
+ * depends on synchronize_rcu_tasks() and ksoftirqd, that is
+ * not spawned in early initcall. So delay the optimization.
+ */
+ optimize_all_kprobes();
+
+ return 0;
+}
+subsys_initcall(init_optprobes);
+#endif
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
@@ -2490,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
@@ -2537,7 +2849,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
list_entry(v, struct kprobe_blacklist_entry, list);
/*
- * If /proc/kallsyms is not showing kernel address, we won't
+ * If '/proc/kallsyms' is not showing kernel address, we won't
* show them here either.
*/
if (!kallsyms_show_value(m->file->f_cred))
@@ -2598,7 +2910,7 @@ static int arm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
+ pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally enabled\n");
@@ -2641,7 +2953,7 @@ static int disarm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
+ pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally disabled\n");
@@ -2676,30 +2988,14 @@ static ssize_t read_enabled_file_bool(struct file *file,
static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
- int ret = 0;
-
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
+ bool enable;
+ int ret;
- buf[buf_size] = '\0';
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- ret = arm_all_kprobes();
- break;
- case 'n':
- case 'N':
- case '0':
- ret = disarm_all_kprobes();
- break;
- default:
- return -EINVAL;
- }
+ ret = kstrtobool_from_user(user_buf, count, &enable);
+ if (ret)
+ return ret;
+ ret = enable ? arm_all_kprobes() : disarm_all_kprobes();
if (ret)
return ret;
@@ -2715,13 +3011,12 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
struct dentry *dir;
- unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
- debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
+ debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
&kprobe_blacklist_fops);
diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c
new file mode 100644
index 000000000000..cf1a73cbf2f6
--- /dev/null
+++ b/kernel/ksyms_common.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ksyms_common.c: A split of kernel/kallsyms.c
+ * Contains a few generic function definations independent of config KALLSYMS.
+ */
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+bool kallsyms_show_value(const struct cred *cred)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return true;
+ fallthrough;
+ case 1:
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..1d4bc493b2f4 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
@@ -20,27 +21,50 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", uevent_helper);
+ return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -61,7 +85,7 @@ KERNEL_ATTR_RW(uevent_helper);
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", prof_on);
+ return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -92,21 +116,26 @@ KERNEL_ATTR_RW(profiling);
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_image);
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", kexec_crash_loaded());
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -131,18 +160,30 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
- return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)VMCOREINFO_NOTE_SIZE);
+ return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+KERNEL_ATTR_RO(crash_elfcorehdr_size);
+
+#endif
+
#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", file_caps_enabled);
+ return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
@@ -151,7 +192,7 @@ int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -168,7 +209,7 @@ int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
static ssize_t rcu_normal_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -211,6 +252,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
@@ -224,6 +267,9 @@ static struct attribute * kernel_attrs[] = {
#endif
#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 132f84a5fde3..c5e40830c1f2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -37,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -51,14 +53,16 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
+ /* To store the full name if task comm is truncated. */
+ char *full_name;
};
enum KTHREAD_BITS {
@@ -67,20 +71,60 @@ enum KTHREAD_BITS {
KTHREAD_SHOULD_PARK,
};
-static inline void set_kthread_struct(void *kthread)
+static inline struct kthread *to_kthread(struct task_struct *k)
{
- /*
- * We abuse ->set_child_tid to avoid the new member and because it
- * can't be wrongly copied by copy_process(). We also rely on fact
- * that the caller can't exec, so PF_KTHREAD can't be cleared.
- */
- current->set_child_tid = (__force void __user *)kthread;
+ WARN_ON(!(k->flags & PF_KTHREAD));
+ return k->worker_private;
}
-static inline struct kthread *to_kthread(struct task_struct *k)
+/*
+ * Variant of to_kthread() that doesn't assume @p is a kthread.
+ *
+ * Per construction; when:
+ *
+ * (p->flags & PF_KTHREAD) && p->worker_private
+ *
+ * the task is both a kthread and struct kthread is persistent. However
+ * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
+ * begin_new_exec()).
+ */
+static inline struct kthread *__to_kthread(struct task_struct *p)
{
- WARN_ON(!(k->flags & PF_KTHREAD));
- return (__force void *)k->set_child_tid;
+ void *kthread = p->worker_private;
+ if (kthread && !(p->flags & PF_KTHREAD))
+ kthread = NULL;
+ return kthread;
+}
+
+void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
+{
+ struct kthread *kthread = to_kthread(tsk);
+
+ if (!kthread || !kthread->full_name) {
+ __get_task_comm(buf, buf_size, tsk);
+ return;
+ }
+
+ strscpy_pad(buf, kthread->full_name, buf_size);
+}
+
+bool set_kthread_struct(struct task_struct *p)
+{
+ struct kthread *kthread;
+
+ if (WARN_ON_ONCE(to_kthread(p)))
+ return false;
+
+ kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+ if (!kthread)
+ return false;
+
+ init_completion(&kthread->exited);
+ init_completion(&kthread->parked);
+ p->vfork_done = &kthread->exited;
+
+ p->worker_private = kthread;
+ return true;
}
void free_kthread_struct(struct task_struct *k)
@@ -88,13 +132,17 @@ void free_kthread_struct(struct task_struct *k)
struct kthread *kthread;
/*
- * Can be NULL if this kthread was created by kernel_thread()
- * or if kmalloc() in kthread() failed.
+ * Can be NULL if kmalloc() in set_kthread_struct() failed.
*/
kthread = to_kthread(k);
+ if (!kthread)
+ return;
+
#ifdef CONFIG_BLK_CGROUP
- WARN_ON_ONCE(kthread && kthread->blkcg_css);
+ WARN_ON_ONCE(kthread->blkcg_css);
#endif
+ k->worker_private = NULL;
+ kfree(kthread->full_name);
kfree(kthread);
}
@@ -111,11 +159,10 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-bool __kthread_should_park(struct task_struct *k)
+static bool __kthread_should_park(struct task_struct *k)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
-EXPORT_SYMBOL_GPL(__kthread_should_park);
/**
* kthread_should_park - should this kthread park now?
@@ -134,6 +181,16 @@ bool kthread_should_park(void)
}
EXPORT_SYMBOL_GPL(kthread_should_park);
+bool kthread_should_stop_or_park(void)
+{
+ struct kthread *kthread = __to_kthread(current);
+
+ if (!kthread)
+ return false;
+
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
@@ -167,8 +224,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
*/
void *kthread_func(struct task_struct *task)
{
- if (task->flags & PF_KTHREAD)
- return to_kthread(task)->threadfn;
+ struct kthread *kthread = __to_kthread(task);
+ if (kthread)
+ return kthread->threadfn;
return NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);
@@ -198,10 +256,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
*/
void *kthread_probe_data(struct task_struct *task)
{
- struct kthread *kthread = to_kthread(task);
+ struct kthread *kthread = __to_kthread(task);
void *data = NULL;
- copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
+ if (kthread)
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
@@ -240,8 +299,47 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
+/**
+ * kthread_exit - Cause the current kthread return @result to kthread_stop().
+ * @result: The integer value to return to kthread_stop().
+ *
+ * While kthread_exit can be called directly, it exists so that
+ * functions which do some additional work in non-modular code such as
+ * module_put_and_kthread_exit can be implemented.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_exit(long result)
+{
+ struct kthread *kthread = to_kthread(current);
+ kthread->result = result;
+ do_exit(0);
+}
+
+/**
+ * kthread_complete_and_exit - Exit the current kthread.
+ * @comp: Completion to complete
+ * @code: The integer value to return to kthread_stop().
+ *
+ * If present, complete @comp and then return code to kthread_stop().
+ *
+ * A kernel thread whose module may be removed after the completion of
+ * @comp can use this function to exit safely.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(kthread_complete_and_exit);
+
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@@ -250,27 +348,26 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kzalloc(sizeof(*self), GFP_KERNEL);
- set_kthread_struct(self);
+ self = to_kthread(current);
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
- do_exit(-EINTR);
- }
-
- if (!self) {
- create->result = ERR_PTR(-ENOMEM);
- complete(done);
- do_exit(-ENOMEM);
+ kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
- init_completion(&self->exited);
- init_completion(&self->parked);
- current->vfork_done = &self->exited;
+
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -290,10 +387,10 @@ static int kthread(void *_create)
__kthread_parkme(self);
ret = threadfn(data);
}
- do_exit(ret);
+ kthread_exit(ret);
}
-/* called from do_fork() to get node information for about to be created task */
+/* called from kernel_clone() to get node information for about to be created task */
int tsk_fork_get_node(struct task_struct *tsk)
{
#ifdef CONFIG_NUMA
@@ -311,11 +408,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -342,6 +441,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -355,9 +459,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * If I was killed by a fatal signal before kthreadd (or new
+ * kernel thread) calls complete(), leave the cleanup of this
+ * structure to that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
@@ -368,23 +472,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
- char name[TASK_COMM_LEN];
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- vsnprintf(name, sizeof(name), namefmt, args);
- set_task_comm(task, name);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
- }
+free_create:
kfree(create);
return task;
}
@@ -404,7 +492,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
+ * argument. @threadfn() can either return directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
@@ -428,7 +516,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
unsigned long flags;
@@ -444,7 +532,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
__kthread_bind_mask(p, cpumask_of(cpu), state);
}
@@ -478,7 +566,6 @@ EXPORT_SYMBOL(kthread_bind);
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
@@ -492,10 +579,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
- set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
return p;
}
+EXPORT_SYMBOL(kthread_create_on_cpu);
+
+void kthread_set_per_cpu(struct task_struct *k, int cpu)
+{
+ struct kthread *kthread = to_kthread(k);
+ if (!kthread)
+ return;
+
+ WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
+
+ if (cpu < 0) {
+ clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+ return;
+ }
+
+ kthread->cpu = cpu;
+ set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
+bool kthread_is_per_cpu(struct task_struct *p)
+{
+ struct kthread *kthread = __to_kthread(p);
+ if (!kthread)
+ return false;
+
+ return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
/**
* kthread_unpark - unpark a thread created by kthread_create().
@@ -574,7 +687,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
- * If threadfn() may call do_exit() itself, the caller must ensure
+ * If threadfn() may call kthread_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
@@ -591,9 +704,10 @@ int kthread_stop(struct task_struct *k)
kthread = to_kthread(k);
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
kthread_unpark(k);
+ set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
wake_up_process(k);
wait_for_completion(&kthread->exited);
- ret = k->exit_code;
+ ret = kthread->result;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
@@ -601,6 +715,24 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
@@ -608,7 +740,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -703,8 +835,15 @@ repeat:
raw_spin_unlock_irq(&worker->lock);
if (work) {
+ kthread_work_func_t func = work->func;
__set_current_state(TASK_RUNNING);
+ trace_sched_kthread_work_execute_start(work);
work->func(work);
+ /*
+ * Avoid dereferencing work after this point. The trace
+ * event only cares about the address.
+ */
+ trace_sched_kthread_work_execute_end(work, func);
} else if (!freezing(current))
schedule();
@@ -756,7 +895,7 @@ fail_task:
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
@@ -774,7 +913,7 @@ EXPORT_SYMBOL(kthread_create_worker);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
- * it to a given CPU and the associated NUMA node.
+ * to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
* @namefmt: printf-style name for the kthread worker (task).
@@ -785,9 +924,27 @@ EXPORT_SYMBOL(kthread_create_worker);
* A good practice is to add the cpu number also into the worker name.
* For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu).
*
- * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
+ * CPU hotplug:
+ * The kthread worker API is simple and generic. It just provides a way
+ * to create, use, and destroy workers.
+ *
+ * It is up to the API user how to handle CPU hotplug. They have to decide
+ * how to handle pending work items, prevent queuing new ones, and
+ * restore the functionality when the CPU goes off and on. There are a
+ * few catches:
+ *
+ * - CPU affinity gets lost when it is scheduled on an offline CPU.
+ *
+ * - The worker might not exist when the CPU was off when the user
+ * created the workers.
+ *
+ * Good practice is to implement two CPU hotplug callbacks and to
+ * destroy/create the worker when the CPU goes down/up.
+ *
+ * Return:
+ * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
@@ -833,6 +990,8 @@ static void kthread_insert_work(struct kthread_worker *worker,
{
kthread_insert_work_sanity_check(worker, work);
+ trace_sched_kthread_work_queue_work(worker, work);
+
list_add_tail(&work->node, pos);
work->worker = worker;
if (!worker->current_work && likely(worker->task))
@@ -896,7 +1055,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
/* Move the work from worker->delayed_work_list. */
WARN_ON_ONCE(list_empty(&work->node));
list_del_init(&work->node);
- kthread_insert_work(worker, work, &worker->work_list);
+ if (!work->canceling)
+ kthread_insert_work(worker, work, &worker->work_list);
raw_spin_unlock_irqrestore(&worker->lock, flags);
}
@@ -1017,8 +1177,38 @@ void kthread_flush_work(struct kthread_work *work)
EXPORT_SYMBOL_GPL(kthread_flush_work);
/*
- * This function removes the work from the worker queue. Also it makes sure
- * that it won't get queued later via the delayed work's timer.
+ * Make sure that the timer is neither set nor running and could
+ * not manipulate the work list_head any longer.
+ *
+ * The function is called under worker->lock. The lock is temporary
+ * released but the timer can't be set again in the meantime.
+ */
+static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
+ unsigned long *flags)
+{
+ struct kthread_delayed_work *dwork =
+ container_of(work, struct kthread_delayed_work, work);
+ struct kthread_worker *worker = work->worker;
+
+ /*
+ * del_timer_sync() must be called to make sure that the timer
+ * callback is not running. The lock must be temporary released
+ * to avoid a deadlock with the callback. In the meantime,
+ * any queuing is blocked by setting the canceling counter.
+ */
+ work->canceling++;
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
+ del_timer_sync(&dwork->timer);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
+ work->canceling--;
+}
+
+/*
+ * This function removes the work from the worker queue.
+ *
+ * It is called under worker->lock. The caller must make sure that
+ * the timer used by delayed work is not running, e.g. by calling
+ * kthread_cancel_delayed_work_timer().
*
* The work might still be in use when this function finishes. See the
* current_work proceed by the worker.
@@ -1026,28 +1216,8 @@ EXPORT_SYMBOL_GPL(kthread_flush_work);
* Return: %true if @work was pending and successfully canceled,
* %false if @work was not pending
*/
-static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
- unsigned long *flags)
+static bool __kthread_cancel_work(struct kthread_work *work)
{
- /* Try to cancel the timer if exists. */
- if (is_dwork) {
- struct kthread_delayed_work *dwork =
- container_of(work, struct kthread_delayed_work, work);
- struct kthread_worker *worker = work->worker;
-
- /*
- * del_timer_sync() must be called to make sure that the timer
- * callback is not running. The lock must be temporary released
- * to avoid a deadlock with the callback. In the meantime,
- * any queuing is blocked by setting the canceling counter.
- */
- work->canceling++;
- raw_spin_unlock_irqrestore(&worker->lock, *flags);
- del_timer_sync(&dwork->timer);
- raw_spin_lock_irqsave(&worker->lock, *flags);
- work->canceling--;
- }
-
/*
* Try to remove the work from a worker list. It might either
* be from worker->work_list or from worker->delayed_work_list.
@@ -1070,14 +1240,14 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
* modify @dwork's timer so that it expires after @delay. If @delay is zero,
* @work is guaranteed to be queued immediately.
*
- * Return: %true if @dwork was pending and its timer was modified,
- * %false otherwise.
+ * Return: %false if @dwork was idle and queued, %true otherwise.
*
* A special case is when the work is being canceled in parallel.
* It might be caused either by the real kthread_cancel_delayed_work_sync()
* or yet another kthread_mod_delayed_work() call. We let the other command
- * win and return %false here. The caller is supposed to synchronize these
- * operations a reasonable way.
+ * win and return %true here. The return value can be used for reference
+ * counting and the number of queued works stays the same. Anyway, the caller
+ * is supposed to synchronize these operations a reasonable way.
*
* This function is safe to call from any context including IRQ handler.
* See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
@@ -1089,22 +1259,39 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
{
struct kthread_work *work = &dwork->work;
unsigned long flags;
- int ret = false;
+ int ret;
raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
- if (!work->worker)
+ if (!work->worker) {
+ ret = false;
goto fast_queue;
+ }
/* Work must not be used with >1 worker, see kthread_queue_work() */
WARN_ON_ONCE(work->worker != worker);
- /* Do not fight with another command that is canceling this work. */
- if (work->canceling)
+ /*
+ * Temporary cancel the work but do not fight with another command
+ * that is canceling the work as well.
+ *
+ * It is a bit tricky because of possible races with another
+ * mod_delayed_work() and cancel_delayed_work() callers.
+ *
+ * The timer must be canceled first because worker->lock is released
+ * when doing so. But the work can be removed from the queue (list)
+ * only when it can be queued again so that the return value can
+ * be used for reference counting.
+ */
+ kthread_cancel_delayed_work_timer(work, &flags);
+ if (work->canceling) {
+ /* The number of works in the queue does not change. */
+ ret = true;
goto out;
+ }
+ ret = __kthread_cancel_work(work);
- ret = __kthread_cancel_work(work, true, &flags);
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
@@ -1126,7 +1313,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
- ret = __kthread_cancel_work(work, is_dwork, &flags);
+ if (is_dwork)
+ kthread_cancel_delayed_work_timer(work, &flags);
+
+ ret = __kthread_cancel_work(work);
if (worker->current_work != work)
goto out_fast;
@@ -1210,6 +1400,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
@@ -1221,6 +1415,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kthread_flush_worker(worker);
kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
@@ -1238,24 +1433,37 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
+ tsk->active_mm = mm;
tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
+ membarrier_update_current_mm(mm);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
#endif
- if (active_mm != mm)
- mmdrop(active_mm);
-
- to_kthread(tsk)->oldfs = get_fs();
- set_fs(USER_DS);
+ /*
+ * When a kthread starts operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after storing to tsk->mm, before accessing
+ * user-space memory. A full memory barrier for membarrier
+ * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
+ * mmdrop_lazy_tlb().
+ */
+ mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1270,14 +1478,25 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- set_fs(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
- sync_mm_rss(mm);
+ /*
+ * When a kthread stops operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after accessing user-space memory, before
+ * clearing tsk->mm.
+ */
+ smp_mb__after_spinlock();
+ local_irq_disable();
tsk->mm = NULL;
+ membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
@@ -1329,5 +1548,4 @@ struct cgroup_subsys_state *kthread_blkcg(void)
}
return NULL;
}
-EXPORT_SYMBOL(kthread_blkcg);
#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 166d7bf49666..781249098cb6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
+#include <linux/sysctl.h>
static DEFINE_RAW_SPINLOCK(latency_lock);
@@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
+
+static struct ctl_table latencytop_sysctl[] = {
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_latencytop,
+ },
+ {}
+};
+#endif
+
void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
@@ -86,7 +112,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -124,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
@@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = {
static int __init init_lstats_procfs(void)
{
proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("kernel", latencytop_sysctl);
+#endif
return 0;
}
-
-int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int err;
-
- err = proc_dointvec(table, write, buffer, lenp, ppos);
- if (latencytop_enabled)
- force_schedstat_enabled();
-
- return err;
-}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 54102deb50ba..53d51ed619a3 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -6,7 +6,7 @@ config HAVE_LIVEPATCH
config LIVEPATCH
bool "Kernel Live Patching"
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on MODULES
depends on SYSFS
depends on KALLSYMS_ALL
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f76fdb925532..ecbc9b6aba3a 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -19,6 +19,7 @@
#include <linux/moduleloader.h>
#include <linux/completion.h>
#include <linux/memory.h>
+#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -32,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
@@ -57,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
@@ -74,7 +76,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (mod && mod->klp_alive)
obj->mod = mod;
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
}
static bool klp_initialized(void)
@@ -117,27 +119,16 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
unsigned long count;
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -152,23 +143,30 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
.count = 0,
.pos = sympos,
};
- mutex_lock(&module_mutex);
if (objname)
- module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
- mutex_unlock(&module_mutex);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
@@ -191,7 +189,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symndx, Elf_Shdr *relasec,
const char *sec_objname)
{
@@ -214,12 +212,12 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* we use the smallest/strictest upper bound possible (56, based on
* the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
@@ -228,7 +226,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%127[^,],%lu",
+ ".klp.sym.%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
@@ -245,7 +243,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* symbols are exported and normal relas can be used instead.
*/
if (!sec_vmlinux && sym_vmlinux) {
- pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
sym_name);
return -EINVAL;
}
@@ -262,6 +260,14 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
return 0;
}
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+}
+
/*
* At a high-level, there are two types of klp relocation sections: those which
* reference symbols which live in vmlinux; and those which reference symbols
@@ -285,10 +291,10 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
* the to-be-patched module to be loaded and patched sometime *after* the
* klp module is loaded.
*/
-int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
- const char *shstrtab, const char *strtab,
- unsigned int symndx, unsigned int secndx,
- const char *objname)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
@@ -310,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
if (strcmp(objname ? objname : "vmlinux", sec_objname))
return 0;
- ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
- if (ret)
- return ret;
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
+ return ret;
- return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ }
+
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return 0;
+}
+
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
+{
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
/*
@@ -326,6 +347,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static int __klp_disable_patch(struct klp_patch *patch);
@@ -432,6 +454,22 @@ static struct attribute *klp_patch_attrs[] = {
};
ATTRIBUTE_GROUPS(klp_patch);
+static ssize_t patched_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+ return sysfs_emit(buf, "%d\n", obj->patched);
+}
+
+static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
+static struct attribute *klp_object_attrs[] = {
+ &patched_kobj_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(klp_object);
+
static void klp_free_object_dynamic(struct klp_object *obj)
{
kfree(obj->name);
@@ -558,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -574,9 +612,10 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
@@ -589,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
@@ -745,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-static int klp_apply_object_relocs(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
{
int i, ret;
struct klp_modinfo *info = patch->mod->klp_info;
@@ -757,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
continue;
- ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
info->secstrings,
patch->mod->core_kallsyms.strtab,
- info->symndx, i, obj->name);
+ info->symndx, i, obj->name, apply);
if (ret)
return ret;
}
@@ -768,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
return 0;
}
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
+
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -863,14 +915,11 @@ static void klp_init_object_early(struct klp_patch *patch,
list_add_tail(&obj->node, &patch->obj_list);
}
-static int klp_init_patch_early(struct klp_patch *patch)
+static void klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
struct klp_func *func;
- if (!patch->objs)
- return -EINVAL;
-
INIT_LIST_HEAD(&patch->list);
INIT_LIST_HEAD(&patch->obj_list);
kobject_init(&patch->kobj, &klp_ktype_patch);
@@ -880,20 +929,12 @@ static int klp_init_patch_early(struct klp_patch *patch)
init_completion(&patch->finish);
klp_for_each_object_static(patch, obj) {
- if (!obj->funcs)
- return -EINVAL;
-
klp_init_object_early(patch, obj);
klp_for_each_func_static(obj, func) {
klp_init_func_early(obj, func);
}
}
-
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- return 0;
}
static int klp_init_patch(struct klp_patch *patch)
@@ -1025,10 +1066,17 @@ err:
int klp_enable_patch(struct klp_patch *patch)
{
int ret;
+ struct klp_object *obj;
- if (!patch || !patch->mod)
+ if (!patch || !patch->mod || !patch->objs)
return -EINVAL;
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+ }
+
+
if (!is_livepatch_module(patch->mod)) {
pr_err("module %s is not marked as a livepatch module\n",
patch->mod->name);
@@ -1052,12 +1100,13 @@ int klp_enable_patch(struct klp_patch *patch)
return -EINVAL;
}
- ret = klp_init_patch_early(patch);
- if (ret) {
+ if (!try_module_get(patch->mod)) {
mutex_unlock(&klp_mutex);
- return ret;
+ return -ENODEV;
}
+ klp_init_patch_early(patch);
+
ret = klp_init_patch(patch);
if (ret)
goto err;
@@ -1158,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
klp_unpatch_object(obj);
klp_post_unpatch_callback(obj);
-
+ klp_clear_object_relocs(patch, obj);
klp_free_object_loaded(obj);
break;
}
@@ -1175,7 +1224,7 @@ int klp_module_coming(struct module *mod)
return -EINVAL;
if (!strcmp(mod->name, "vmlinux")) {
- pr_err("vmlinux.ko: invalid module name");
+ pr_err("vmlinux.ko: invalid module name\n");
return -EINVAL;
}
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index b552cf2d85f8..4152c71507e2 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -40,19 +40,24 @@ struct klp_ops *klp_find_ops(void *old_func)
static void notrace klp_ftrace_handler(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *fops,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct klp_ops *ops;
struct klp_func *func;
int patch_state;
+ int bit;
ops = container_of(fops, struct klp_ops, fops);
/*
- * A variant of synchronize_rcu() is used to allow patching functions
- * where RCU is not watching, see klp_synchronize_transition().
+ * The ftrace_test_recursion_trylock() will disable preemption,
+ * which is required for the variant of synchronize_rcu() that is
+ * used to allow patching functions where RCU is not watching.
+ * See klp_synchronize_transition() for more details.
*/
- preempt_disable_notrace();
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (WARN_ON_ONCE(bit < 0))
+ return;
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
@@ -113,25 +118,12 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- klp_arch_set_pc(regs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
- preempt_enable_notrace();
+ ftrace_test_recursion_unlock(bit);
}
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
static void klp_unpatch_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -148,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -181,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
@@ -194,8 +184,10 @@ static int klp_patch_func(struct klp_func *func)
return -ENOMEM;
ops->fops.func = klp_ftrace_handler;
- ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
- FTRACE_OPS_FL_DYNAMIC |
+ ops->fops.flags = FTRACE_OPS_FL_DYNAMIC |
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ FTRACE_OPS_FL_SAVE_REGS |
+#endif
FTRACE_OPS_FL_IPMODIFY |
FTRACE_OPS_FL_PERMANENT;
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index e5c9fb295ba9..c2e724d97ddf 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -272,12 +272,12 @@ void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
EXPORT_SYMBOL_GPL(klp_shadow_free);
/**
- * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * klp_shadow_free_all() - detach and free all <_, id> shadow variables
* @id: data identifier
* @dtor: custom callback that can be used to unregister the variable
* and/or free data that the shadow variable points to (optional)
*
- * This function releases the memory for all <*, id> shadow variable
+ * This function releases the memory for all <_, id> shadow variable
* instances, callers should stop referencing them accordingly.
*/
void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
@@ -288,7 +288,7 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
spin_lock_irqsave(&klp_shadow_lock, flags);
- /* Delete all <*, id> from hash */
+ /* Delete all <_, id> from hash */
hash_for_each(klp_shadow_hash, i, shadow, node) {
if (klp_shadow_match(shadow, shadow->obj, id))
klp_shadow_free_struct(shadow, dtor);
diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c
index 7ee19476de9d..2565d039ade0 100644
--- a/kernel/livepatch/state.c
+++ b/kernel/livepatch/state.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state);
*
* The function can be called only during transition when a new
* livepatch is being enabled or when such a transition is reverted.
- * It is typically called only from from pre/post (un)patch
+ * It is typically called only from pre/post (un)patch
* callbacks.
*
* Return: pointer to the latest struct klp_state from already
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f6310f848f34..e54c3d60a904 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,12 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
-#include "../sched/sched.h"
#define MAX_STACK_ENTRIES 100
+static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -26,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -173,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -197,36 +218,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
@@ -239,20 +260,19 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
* Determine whether it's safe to transition the task to the target patch state
* by looking for any to-be-patched or to-be-unpatched functions on its stack.
*/
-static int klp_check_stack(struct task_struct *task, char *err_buf)
+static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
- if (ret < 0) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d has an unreliable stack\n",
- __func__, task->comm, task->pid);
- return ret;
- }
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
+ if (ret < 0)
+ return -EINVAL;
nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
@@ -261,11 +281,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
klp_for_each_func(obj, func) {
ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is sleeping on function %s\n",
- __func__, task->comm, task->pid,
- func->old_name);
- return ret;
+ *oldname = func->old_name;
+ return -EADDRINUSE;
}
}
}
@@ -273,6 +290,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
return 0;
}
+static int klp_check_and_switch_task(struct task_struct *task, void *arg)
+{
+ int ret;
+
+ if (task_curr(task) && task != current)
+ return -EBUSY;
+
+ ret = klp_check_stack(task, arg);
+ if (ret)
+ return ret;
+
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ return 0;
+}
+
/*
* Try to safely switch a task to the target patch state. If it's currently
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
@@ -280,13 +313,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
*/
static bool klp_try_switch_task(struct task_struct *task)
{
- static char err_buf[STACK_ERR_BUF_SIZE];
- struct rq *rq;
- struct rq_flags flags;
+ const char *old_name;
int ret;
- bool success = false;
-
- err_buf[0] = '\0';
/* check if this task has already switched over */
if (task->patch_state == klp_target_state)
@@ -304,37 +332,74 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- rq = task_rq_lock(task, &flags);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
+ switch (ret) {
+ case 0: /* success */
+ break;
- if (task_running(rq, task) && task != current) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is running\n", __func__, task->comm,
- task->pid);
- goto done;
+ case -EBUSY: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is running\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EINVAL: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d has an unreliable stack\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EADDRINUSE: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is sleeping on function %s\n",
+ __func__, task->comm, task->pid, old_name);
+ break;
+
+ default:
+ pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
+ __func__, ret, task->comm, task->pid);
+ break;
}
- ret = klp_check_stack(task, err_buf);
- if (ret)
- goto done;
+ return !ret;
+}
- success = true;
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
- clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
- task->patch_state = klp_target_state;
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
-done:
- task_rq_unlock(rq, task, &flags);
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
/*
- * Due to console deadlock issues, pr_debug() can't be used while
- * holding the task rq lock. Instead we have to use a temporary buffer
- * and print the debug message after releasing the lock.
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
*/
- if (err_buf[0] != '\0')
- pr_debug("%s", err_buf);
+ smp_rmb();
- return success;
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
}
+EXPORT_SYMBOL(__klp_sched_try_switch);
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
@@ -369,9 +434,7 @@ static void klp_send_signals(void)
* Send fake signal to all non-kthread tasks which are
* still not migrated.
*/
- spin_lock_irq(&task->sighand->siglock);
- signal_wake_up(task, 0);
- spin_unlock_irq(&task->sighand->siglock);
+ set_notify_signal(task);
}
}
read_unlock(&tasklist_lock);
@@ -412,19 +475,22 @@ void klp_try_complete_transition(void)
/*
* Ditto for the idle "swapper" tasks.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
- if (!klp_try_switch_task(task))
+ if (!klp_try_switch_task(task)) {
complete = false;
+ /* Make idle task go through the main loop. */
+ wake_up_if_idle(cpu);
+ }
} else if (task->patch_state != klp_target_state) {
/* offline idle tasks can be switched immediately */
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
task->patch_state = klp_target_state;
}
}
- put_online_cpus();
+ cpus_read_unlock();
if (!complete) {
if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
@@ -441,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -493,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -548,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -585,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -602,18 +668,51 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
/* Called from copy_process() during fork */
void klp_copy_process(struct task_struct *child)
{
- child->patch_state = current->patch_state;
- /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+ /*
+ * The parent process may have gone through a KLP transition since
+ * the thread flag was copied in setup_thread_stack earlier. Bring
+ * the task flag up to date with the parent here.
+ *
+ * The operation is serialized against all klp_*_transition()
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
+ */
+ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
+ set_tsk_thread_flag(child, TIF_PATCH_PENDING);
+ else
+ clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
+
+ child->patch_state = current->patch_state;
}
/*
@@ -641,6 +740,13 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_for_each_patch(patch)
- patch->forced = true;
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
}
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6d11cfb9b41f..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,16 +5,16 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
@@ -24,8 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..810b50344d35
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
int i;
- if (!d_counts)
+ if (IS_ERR(d_counts))
goto out;
/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
for (i = 0; i < lockevent_num; i++) {
if (skip_lockevent(lockevent_names[i]))
continue;
- if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
- (void *)(long)i, &fops_lockevent))
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
goto fail_undo;
}
- if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
d_counts, (void *)(long)LOCKEVENT_reset_cnts,
- &fops_lockevent))
+ &fops_lockevent)))
goto fail_undo;
return 0;
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 8c7e7d25f09c..a6016b91803d 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#define lockevent_cond_inc(ev, c)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 239039d0ce21..97fb6f3f840a 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,13 +56,11 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
-LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
-LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */
LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
-LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
-LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */
LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..151bd3de5936 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -54,28 +54,77 @@
#include <linux/nmi.h>
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
+#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+ { }
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+DEFINE_PER_CPU(unsigned int, lockdep_recursion);
+EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
+
+static __always_inline bool lockdep_enabled(void)
+{
+ if (!debug_locks)
+ return false;
+
+ if (this_cpu_read(lockdep_recursion))
+ return false;
+
+ if (current->lockdep_recursion)
+ return false;
+
+ return true;
+}
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -91,19 +140,21 @@ static inline void lockdep_lock(void)
{
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ __this_cpu_inc(lockdep_recursion);
arch_spin_lock(&__lock);
__owner = current;
- current->lockdep_recursion++;
}
static inline void lockdep_unlock(void)
{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
return;
- current->lockdep_recursion--;
__owner = NULL;
arch_spin_unlock(&__lock);
+ __this_cpu_dec(lockdep_recursion);
}
static inline bool lockdep_assert_locked(void)
@@ -163,11 +214,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -318,7 +367,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -372,6 +421,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE];
static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
+ * the id of held_lock
+ */
+static inline u16 hlock_id(struct held_lock *hlock)
+{
+ BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16);
+
+ return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
+}
+
+static inline unsigned int chain_hlock_class_idx(u16 hlock_id)
+{
+ return hlock_id & (MAX_LOCKDEP_KEYS - 1);
+}
+
+/*
* The hash key of the lock dependency chains is a hash itself too:
* it's a hash of all locks taken up to that lock, including that lock.
* It's a 64-bit hash, because it's important for the keys to be
@@ -393,10 +457,15 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
+static __always_inline void lockdep_recursion_inc(void)
+{
+ __this_cpu_inc(lockdep_recursion);
+}
+
static __always_inline void lockdep_recursion_finish(void)
{
- if (WARN_ON_ONCE(--current->lockdep_recursion))
- current->lockdep_recursion = 0;
+ if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion)))
+ __this_cpu_write(lockdep_recursion, 0);
}
void lockdep_set_selftest_task(struct task_struct *task)
@@ -585,6 +654,8 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USED_READ] = "INITIAL READ USE",
+ /* abused as string storage for verify_lock_unused() */
[LOCK_USAGE_STATES] = "IN-NMI",
};
#endif
@@ -638,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -653,18 +724,20 @@ static void __print_lock_name(struct lock_class *class)
printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
printk(KERN_CONT " (");
- __print_lock_name(class);
- printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ __print_lock_name(hlock, class);
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
}
@@ -701,7 +774,7 @@ static void print_lock(struct held_lock *hlock)
}
printk(KERN_CONT "%px", hlock->instance);
- print_lock_name(lock);
+ print_lock_name(hlock, lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -718,7 +791,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
*/
- if (p->state == TASK_RUNNING && p != current)
+ if (p != current && task_is_running(p))
return;
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
@@ -748,20 +821,24 @@ static int very_verbose(struct lock_class *class)
#ifdef __KERNEL__
static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
- if (arch_is_kernel_initmem_freed(addr))
- return 0;
+ if (is_kernel_core_data(addr))
+ return 1;
/*
- * static variable?
+ * keys are allowed in the __ro_after_init section.
*/
- if ((addr >= start) && (addr < end))
+ if (is_kernel_rodata(addr))
return 1;
- if (arch_is_kernel_data(addr))
+ /*
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
+ */
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -801,7 +878,7 @@ static int count_matching_names(struct lock_class *new_class)
}
/* used from NMI context -- must be lockless */
-static __always_inline struct lock_class *
+static noinstr struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
@@ -809,12 +886,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
struct lock_class *class;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ instrumentation_begin();
debug_locks_off();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ instrumentation_end();
return NULL;
}
@@ -844,14 +923,16 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name &&
- lock->key != &__lockdep_no_validate__);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
@@ -889,7 +970,8 @@ static bool assign_lock_key(struct lockdep_map *lock)
/* Debug-check: all keys must be persistent! */
debug_locks_off();
pr_err("INFO: trying to register non-static key.\n");
- pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
return false;
@@ -1195,6 +1277,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1249,6 +1332,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class->name_version = count_matching_names(class);
class->wait_type_inner = lock->wait_type_inner;
class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -1259,6 +1343,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
@@ -1320,7 +1407,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, int distance,
+ u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1334,6 +1421,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
+ entry->dep = dep;
entry->distance = distance;
entry->trace = trace;
/*
@@ -1349,7 +1437,7 @@ static int add_lock_to_list(struct lock_class *this,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
@@ -1421,23 +1509,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
return (cq->rear - cq->front) & CQ_MASK;
}
-static inline void mark_lock_accessed(struct lock_list *lock,
- struct lock_list *parent)
+static inline void mark_lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
+static inline void visit_lock_entry(struct lock_list *lock,
+ struct lock_list *parent)
+{
lock->parent = parent;
- lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
-
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1471,87 +1555,293 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
return lock_class + offset;
}
+/*
+ * Return values of a bfs search:
+ *
+ * BFS_E* indicates an error
+ * BFS_R* indicates a result (match or not)
+ *
+ * BFS_EINVALIDNODE: Find a invalid node in the graph.
+ *
+ * BFS_EQUEUEFULL: The queue is full while doing the bfs.
+ *
+ * BFS_RMATCH: Find the matched node in the graph, and put that node into
+ * *@target_entry.
+ *
+ * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry
+ * _unchanged_.
+ */
+enum bfs_result {
+ BFS_EINVALIDNODE = -2,
+ BFS_EQUEUEFULL = -1,
+ BFS_RMATCH = 0,
+ BFS_RNOMATCH = 1,
+};
+
+/*
+ * bfs_result < 0 means error
+ */
+static inline bool bfs_error(enum bfs_result res)
+{
+ return res < 0;
+}
/*
- * Forward- or backward-dependency search, used for both circular dependency
- * checking and hardirq-unsafe/softirq-unsafe checking.
+ * DEP_*_BIT in lock_list::dep
+ *
+ * For dependency @prev -> @next:
+ *
+ * SR: @prev is shared reader (->read != 0) and @next is recursive reader
+ * (->read == 2)
+ * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader
+ * SN: @prev is shared reader and @next is non-recursive locker (->read != 2)
+ * EN: @prev is exclusive locker and @next is non-recursive locker
+ *
+ * Note that we define the value of DEP_*_BITs so that:
+ * bit0 is prev->read == 0
+ * bit1 is next->read != 2
*/
-static int __bfs(struct lock_list *source_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry,
- int offset)
+#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */
+#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */
+#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */
+#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */
+
+#define DEP_SR_MASK (1U << (DEP_SR_BIT))
+#define DEP_ER_MASK (1U << (DEP_ER_BIT))
+#define DEP_SN_MASK (1U << (DEP_SN_BIT))
+#define DEP_EN_MASK (1U << (DEP_EN_BIT))
+
+static inline unsigned int
+__calc_dep_bit(struct held_lock *prev, struct held_lock *next)
{
+ return (prev->read == 0) + ((next->read != 2) << 1);
+}
+
+static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bit(prev, next);
+}
+
+/*
+ * calculate the dep_bit for backwards edges. We care about whether @prev is
+ * shared and whether @next is recursive.
+ */
+static inline unsigned int
+__calc_dep_bitb(struct held_lock *prev, struct held_lock *next)
+{
+ return (next->read != 2) + ((prev->read == 0) << 1);
+}
+
+static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bitb(prev, next);
+}
+
+/*
+ * Initialize a lock_list entry @lock belonging to @class as the root for a BFS
+ * search.
+ */
+static inline void __bfs_init_root(struct lock_list *lock,
+ struct lock_class *class)
+{
+ lock->class = class;
+ lock->parent = NULL;
+ lock->only_xr = 0;
+}
+
+/*
+ * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the
+ * root for a BFS search.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure
+ * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)->
+ * and -(S*)->.
+ */
+static inline void bfs_init_root(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read == 2);
+}
+
+/*
+ * Similar to bfs_init_root() but initialize the root for backwards BFS.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure
+ * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not
+ * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->).
+ */
+static inline void bfs_init_rootb(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read != 0);
+}
+
+static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
+{
+ if (!lock || !lock->parent)
+ return NULL;
+
+ return list_next_or_null_rcu(get_dep_list(lock->parent, offset),
+ &lock->entry, struct lock_list, entry);
+}
+
+/*
+ * Breadth-First Search to find a strong path in the dependency graph.
+ *
+ * @source_entry: the source of the path we are searching for.
+ * @data: data used for the second parameter of @match function
+ * @match: match function for the search
+ * @target_entry: pointer to the target of a matched path
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ *
+ * We may have multiple edges (considering different kinds of dependencies,
+ * e.g. ER and SN) between two nodes in the dependency graph. But
+ * only the strong dependency path in the graph is relevant to deadlocks. A
+ * strong dependency path is a dependency path that doesn't have two adjacent
+ * dependencies as -(*R)-> -(S*)->, please see:
+ *
+ * Documentation/locking/lockdep-design.rst
+ *
+ * for more explanation of the definition of strong dependency paths
+ *
+ * In __bfs(), we only traverse in the strong dependency path:
+ *
+ * In lock_list::only_xr, we record whether the previous dependency only
+ * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we
+ * filter out any -(S*)-> in the current dependency and after that, the
+ * ->only_xr is set according to whether we only have -(*R)-> left.
+ */
+static enum bfs_result __bfs(struct lock_list *source_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int offset)
+{
+ struct circular_queue *cq = &lock_cq;
+ struct lock_list *lock = NULL;
struct lock_list *entry;
- struct lock_list *lock;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
- int ret = 1;
+ unsigned int cq_depth;
+ bool first;
lockdep_assert_locked();
- if (match(source_entry, data)) {
- *target_entry = source_entry;
- ret = 0;
- goto exit;
- }
-
- head = get_dep_list(source_entry, offset);
- if (list_empty(head))
- goto exit;
-
__cq_init(cq);
__cq_enqueue(cq, source_entry);
- while ((lock = __cq_dequeue(cq))) {
+ while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) {
+ if (!lock->class)
+ return BFS_EINVALIDNODE;
- if (!lock->class) {
- ret = -2;
- goto exit;
+ /*
+ * Step 1: check whether we already finish on this one.
+ *
+ * If we have visited all the dependencies from this @lock to
+ * others (iow, if we have visited all lock_list entries in
+ * @lock->class->locks_{after,before}) we skip, otherwise go
+ * and visit all the dependencies in the list and mark this
+ * list accessed.
+ */
+ if (lock_accessed(lock))
+ continue;
+ else
+ mark_lock_accessed(lock);
+
+ /*
+ * Step 2: check whether prev dependency and this form a strong
+ * dependency path.
+ */
+ if (lock->parent) { /* Parent exists, check prev dependency */
+ u8 dep = lock->dep;
+ bool prev_only_xr = lock->parent->only_xr;
+
+ /*
+ * Mask out all -(S*)-> if we only have *R in previous
+ * step, because -(*R)-> -(S*)-> don't make up a strong
+ * dependency.
+ */
+ if (prev_only_xr)
+ dep &= ~(DEP_SR_MASK | DEP_SN_MASK);
+
+ /* If nothing left, we skip */
+ if (!dep)
+ continue;
+
+ /* If there are only -(*R)-> left, set that for the next step */
+ lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK));
}
- head = get_dep_list(lock, offset);
+ /*
+ * Step 3: we haven't visited this and there is a strong
+ * dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
+ */
+ if (skip && skip(lock, data))
+ continue;
+
+ if (match(lock, data)) {
+ *target_entry = lock;
+ return BFS_RMATCH;
+ }
+ /*
+ * Step 4: if not match, expand the path by adding the
+ * forward or backwards dependencies in the search
+ *
+ */
+ first = true;
+ head = get_dep_list(lock, offset);
list_for_each_entry_rcu(entry, head, entry) {
- if (!lock_accessed(entry)) {
- unsigned int cq_depth;
- mark_lock_accessed(entry, lock);
- if (match(entry, data)) {
- *target_entry = entry;
- ret = 0;
- goto exit;
- }
+ visit_lock_entry(entry, lock);
- if (__cq_enqueue(cq, entry)) {
- ret = -1;
- goto exit;
- }
- cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
- max_bfs_queue_depth = cq_depth;
- }
+ /*
+ * Note we only enqueue the first of the list into the
+ * queue, because we can always find a sibling
+ * dependency from one (see __bfs_next()), as a result
+ * the space of queue is saved.
+ */
+ if (!first)
+ continue;
+
+ first = false;
+
+ if (__cq_enqueue(cq, entry))
+ return BFS_EQUEUEFULL;
+
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
}
}
-exit:
- return ret;
+
+ return BFS_RNOMATCH;
}
-static inline int __bfs_forwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_after));
}
-static inline int __bfs_backwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_before));
}
@@ -1572,7 +1862,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
if (debug_locks_silent)
return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
+ print_lock_name(NULL, target->class);
printk(KERN_CONT ":\n");
print_lock_trace(target->trace, 6);
}
@@ -1585,6 +1875,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1601,28 +1893,36 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT " --> ");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT " --> ");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
+ __print_lock_name(src, source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1659,15 +1959,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
print_circular_bug_entry(entry, depth);
}
-static inline int class_equal(struct lock_list *entry, void *data)
+/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get a equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B is not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
{
- return entry->class == data;
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
+}
+
+/*
+ * We are about to add B -> A into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong
+ * dependency cycle, that means:
+ *
+ * Either
+ *
+ * a) B -> A is -(E*)->
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B)
+ *
+ * as then we don't have -(*R)-> -(S*)-> in the cycle.
+ */
+static inline bool hlock_conflict(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 0 || /* B -> A is -(E*)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
static noinline void print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1714,18 +2071,18 @@ static noinline void print_bfs_bug(int ret)
WARN(1, "lockdep bfs error:%d\n", ret);
}
-static int noop_count(struct lock_list *entry, void *data)
+static bool noop_count(struct lock_list *entry, void *data)
{
(*(unsigned long *)data)++;
- return 0;
+ return false;
}
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1734,8 +2091,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1749,9 +2105,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1761,8 +2117,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1775,46 +2130,48 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
/*
* Check that the dependency graph starting at <src> can lead to
- * <target> or not. Print an error and return 0 if it does.
+ * <target> or not.
*/
-static noinline int
-check_path(struct lock_class *target, struct lock_list *src_entry,
+static noinline enum bfs_result
+check_path(struct held_lock *target, struct lock_list *src_entry,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- int ret;
+ enum bfs_result ret;
- ret = __bfs_forwards(src_entry, (void *)target, class_equal,
- target_entry);
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
- if (unlikely(ret < 0))
+ if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
return ret;
}
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
/*
* Prove that the dependency graph starting at <src> can not
* lead to <target>. If it can, there is a circle when adding
* <target> -> <src> dependency.
*
- * Print an error and return 0 if it does.
+ * Print an error and return BFS_RMATCH if it does.
*/
-static noinline int
+static noinline enum bfs_result
check_noncircular(struct held_lock *src, struct held_lock *target,
struct lock_trace **const trace)
{
- int ret;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
- if (unlikely(!ret)) {
+ if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
/*
* If save_trace fails here, the printing might
@@ -1824,83 +2181,144 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
*trace = save_trace();
}
- print_circular_bug(&src_entry, target_entry, src, target);
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
}
return ret;
}
-#ifdef CONFIG_LOCKDEP_SMALL
+#ifdef CONFIG_TRACE_IRQFLAGS
+
/*
- * Check that the dependency graph starting at <src> can lead to
- * <target> or not. If it can, <src> -> <target> dependency is already
- * in the graph.
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ *
+ * A irq safe->unsafe deadlock happens with the following conditions:
+ *
+ * 1) We have a strong dependency path A -> ... -> B
*
- * Print an error and return 2 if it does or 1 if it does not.
+ * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore
+ * irq can create a new dependency B -> A (consider the case that a holder
+ * of B gets interrupted by an irq whose handler will try to acquire A).
+ *
+ * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a
+ * strong circle:
+ *
+ * For the usage bits of B:
+ * a) if A -> B is -(*N)->, then B -> A could be any type, so any
+ * ENABLED_IRQ usage suffices.
+ * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only
+ * ENABLED_IRQ_*_READ usage suffices.
+ *
+ * For the usage bits of A:
+ * c) if A -> B is -(E*)->, then B -> A could be any type, so any
+ * USED_IN_IRQ usage suffices.
+ * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only
+ * USED_IN_IRQ_*_READ usage suffices.
*/
-static noinline int
-check_redundant(struct held_lock *src, struct held_lock *target)
-{
- int ret;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
-
- debug_atomic_inc(nr_redundant_checks);
-
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
- if (!ret) {
- debug_atomic_inc(nr_redundant);
- ret = 2;
- } else if (ret < 0)
- ret = 0;
-
- return ret;
-}
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-
-static inline int usage_accumulate(struct lock_list *entry, void *mask)
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of A should be accumulated to detect
+ * safe->unsafe bugs.
+ *
+ * Note that usage_accumulate() is used in backwards search, so ->only_xr
+ * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true).
+ *
+ * As above, if only_xr is false, which means A -> B has -(E*)-> dependency
+ * path, any usage of A should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_accumulate(struct lock_list *entry, void *mask)
{
- *(unsigned long *)mask |= entry->class->usage_mask;
+ if (!entry->only_xr)
+ *(unsigned long *)mask |= entry->class->usage_mask;
+ else /* Mask out _READ usage bits */
+ *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ);
- return 0;
+ return false;
}
/*
- * Forwards and backwards subgraph searching, for the purposes of
- * proving that two subgraphs can be connected by a new dependency
- * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of B conflicts with the usage bits of A,
+ * i.e. which usage bit of B may introduce safe->unsafe deadlocks.
+ *
+ * As above, if only_xr is false, which means A -> B has -(*N)-> dependency
+ * path, any usage of B should be considered. Otherwise, we should only
+ * consider _READ usage.
*/
+static inline bool usage_match(struct lock_list *entry, void *mask)
+{
+ if (!entry->only_xr)
+ return !!(entry->class->usage_mask & *(unsigned long *)mask);
+ else /* Mask out _READ usage bits */
+ return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
+}
-static inline int usage_match(struct lock_list *entry, void *mask)
+static inline bool usage_skip(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & *(unsigned long *)mask;
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
+
+ return true;
}
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
*
- * Return 0 if such a node exists in the subgraph, and put that node
+ * Return BFS_MATCH if such a node exists in the subgraph, and put that node
* into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1908,22 +2326,16 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
/*
* Find a node in the backwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
- *
- * Return 0 if such a node exists in the subgraph, and put that node
- * into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1933,13 +2345,13 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
+ print_lock_name(NULL, class);
#ifdef CONFIG_DEBUG_LOCKDEP
printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
#endif
printk(KERN_CONT " {\n");
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ for (bit = 0; bit < LOCK_TRACE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
@@ -1955,7 +2367,56 @@ static void print_lock_class_header(struct lock_class *class, int depth)
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (bacwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a bacwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follow:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
@@ -1983,6 +2444,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
} while (entry && (depth >= 0));
}
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follow:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+}
+
static void
print_irq_lock_scenario(struct lock_list *safe_entry,
struct lock_list *unsafe_entry,
@@ -2011,11 +2527,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT " --> ");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT " --> ");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT "\n\n");
}
@@ -2023,18 +2539,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2062,29 +2578,29 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("-----------------------------------------------------\n");
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- curr->hardirqs_enabled,
+ lockdep_hardirqs_enabled(),
curr->softirqs_enabled);
print_lock(next);
pr_warn("\nand this task is already holding:\n");
print_lock(prev);
pr_warn("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
+ print_lock_name(prev, hlock_class(prev));
pr_cont(" ->");
- print_lock_name(hlock_class(next));
+ print_lock_name(next, hlock_class(next));
pr_cont("\n");
pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
+ print_lock_name(NULL, backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
+ print_lock_name(NULL, forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
@@ -2097,10 +2613,7 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- prev_root->trace = save_trace();
- if (!prev_root->trace)
- return;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
@@ -2179,17 +2692,39 @@ static unsigned long invert_dir_mask(unsigned long mask)
}
/*
- * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
- * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
- * And then mask out all bitnr0.
+ * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ
+ * usage may cause deadlock too, for example:
+ *
+ * P1 P2
+ * <irq disabled>
+ * write_lock(l1); <irq enabled>
+ * read_lock(l2);
+ * write_lock(l2);
+ * <in irq>
+ * read_lock(l1);
+ *
+ * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2
+ * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible
+ * deadlock.
+ *
+ * In fact, all of the following cases may cause deadlocks:
+ *
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ
+ *
+ * As a result, to calculate the "exclusive mask", first we invert the
+ * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with
+ * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all
+ * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*).
*/
static unsigned long exclusive_mask(unsigned long mask)
{
unsigned long excl = invert_dir_mask(mask);
- /* Strip read */
excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
- excl &= ~LOCKF_IRQ_READ;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
}
@@ -2206,6 +2741,7 @@ static unsigned long original_mask(unsigned long mask)
unsigned long excl = invert_dir_mask(mask);
/* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
@@ -2220,14 +2756,24 @@ static int find_exclusive_match(unsigned long mask,
enum lock_usage_bit *bitp,
enum lock_usage_bit *excl_bitp)
{
- int bit, excl;
+ int bit, excl, excl_read;
for_each_set_bit(bit, &mask, LOCK_USED) {
+ /*
+ * exclusive_bit() strips the read bit, however,
+ * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need
+ * to search excl | LOCK_USAGE_READ_MASK as well.
+ */
excl = exclusive_bit(bit);
+ excl_read = excl | LOCK_USAGE_READ_MASK;
if (excl_mask & lock_flag(excl)) {
*bitp = bit;
*excl_bitp = excl;
return 0;
+ } else if (excl_mask & lock_flag(excl_read)) {
+ *bitp = bit;
+ *excl_bitp = excl_read;
+ return 0;
}
}
return -1;
@@ -2244,20 +2790,19 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
{
unsigned long usage_mask = 0, forward_mask, backward_mask;
enum lock_usage_bit forward_bit = 0, backward_bit = 0;
- struct lock_list *uninitialized_var(target_entry1);
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry1;
+ struct lock_list *target_entry;
struct lock_list this, that;
- int ret;
+ enum bfs_result ret;
/*
* Step 1: gather all hard/soft IRQs usages backward in an
* accumulated usage mask.
*/
- this.parent = NULL;
- this.class = hlock_class(prev);
+ bfs_init_rootb(&this, prev);
- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
- if (ret < 0) {
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
@@ -2272,30 +2817,39 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
forward_mask = exclusive_mask(usage_mask);
- that.parent = NULL;
- that.class = hlock_class(next);
+ bfs_init_root(&that, next);
ret = find_usage_forwards(&that, forward_mask, &target_entry1);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
/*
* Step 3: we found a bad match! Now retrieve a lock from the backward
* list whose usage mask matches the exclusive usage mask from the
* lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the follow case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, if we use exact
+ * invert bits of M's usage_mask, we will find another lock N that is
+ * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not
+ * cause a inversion deadlock.
*/
- backward_mask = original_mask(target_entry1->class->usage_mask);
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (DEBUG_LOCKS_WARN_ON(ret == 1))
+ if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH))
return 1;
/*
@@ -2324,8 +2878,68 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ return false;
+}
+
#endif /* CONFIG_TRACE_IRQFLAGS */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
+}
+
+#else
+
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ return BFS_RNOMATCH;
+}
+
+#endif
+
static void inc_chains(int irq_context)
{
if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
@@ -2356,10 +2970,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
+ __print_lock_name(prv, prev);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
+ __print_lock_name(nxt, next);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
@@ -2369,6 +2983,8 @@ static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
@@ -2383,6 +2999,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
@@ -2397,11 +3018,14 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* (Note that this has to be done separately, because the graph cannot
* detect such classes of deadlocks.)
*
- * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same
+ * lock class is held but nest_lock is also held, i.e. we rely on the
+ * nest_lock to avoid the deadlock.
*/
static int
check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -2420,7 +3044,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
if ((next->read == 2) && prev->read)
- return 2;
+ continue;
+
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
/*
* We're holding the nest_lock, which serializes this lock's
@@ -2459,11 +3089,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance,
+ struct held_lock *next, u16 distance,
struct lock_trace **const trace)
{
struct lock_list *entry;
- int ret;
+ enum bfs_result ret;
if (!hlock_class(prev)->key || !hlock_class(next)->key) {
/*
@@ -2483,6 +3113,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 2;
}
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -2494,23 +3132,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* in the graph whose neighbours are to be checked.
*/
ret = check_noncircular(next, prev, trace);
- if (unlikely(ret <= 0))
+ if (unlikely(bfs_error(ret) || ret == BFS_RMATCH))
return 0;
if (!check_irq_usage(curr, prev, next))
return 0;
/*
- * For recursive read-locks we do all the dependency checks,
- * but we dont store read-triggered dependencies (only
- * write-triggered dependencies). This ensures that only the
- * write-side dependencies matter, and that if for example a
- * write-lock never takes any other locks, then the reads are
- * equivalent to a NOP.
- */
- if (next->read == 2 || prev->read == 2)
- return 1;
- /*
* Is the <prev> -> <next> dependency already present?
*
* (this may occur even though this is a new chain: consider
@@ -2522,18 +3150,46 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 1;
+ entry->dep |= calc_dep(prev, next);
+
+ /*
+ * Also, update the reverse dependency in @next's
+ * ->locks_before list.
+ *
+ * Here we reuse @entry as the cursor, which is fine
+ * because we won't go to the next iteration of the
+ * outer loop:
+ *
+ * For normal cases, we return in the inner loop.
+ *
+ * If we fail to return, we have inconsistency, i.e.
+ * <prev>::locks_after contains <next> while
+ * <next>::locks_before doesn't contain <prev>. In
+ * that case, we return after the inner and indicate
+ * something is wrong.
+ */
+ list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) {
+ if (entry->class == hlock_class(prev)) {
+ if (distance == 1)
+ entry->distance = 1;
+ entry->dep |= calc_depb(prev, next);
+ return 1;
+ }
+ }
+
+ /* <prev> is not found in <next>::locks_before */
+ return 0;
}
}
-#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
ret = check_redundant(prev, next);
- if (ret != 1)
- return ret;
-#endif
+ if (bfs_error(ret))
+ return 0;
+ else if (ret == BFS_RMATCH)
+ return 2;
if (!*trace) {
*trace = save_trace();
@@ -2546,15 +3202,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance, *trace);
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance, *trace);
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
@@ -2590,16 +3246,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
goto out_bug;
for (;;) {
- int distance = curr->lockdep_depth - depth + 1;
+ u16 distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
- /*
- * Only non-recursive-read entries get new dependencies
- * added:
- */
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance,
- &trace);
+ if (hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace);
if (!ret)
return 0;
@@ -2846,7 +3497,8 @@ static int alloc_chain_hlocks(int req)
size = chain_block_size(curr);
if (likely(size >= req)) {
del_chain_block(0, size, chain_block_next(curr));
- add_chain_block(curr + req, size - req);
+ if (size > req)
+ add_chain_block(curr + req, size - req);
return curr;
}
}
@@ -2875,7 +3527,10 @@ static inline void free_chain_hlocks(int base, int size)
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
{
- return lock_classes + chain_hlocks[chain->base + i];
+ u16 chain_hlock = chain_hlocks[chain->base + i];
+ unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
+
+ return lock_classes + class_idx;
}
/*
@@ -2901,12 +3556,12 @@ static inline int get_first_held_lock(struct task_struct *curr,
/*
* Returns the next chain_key iteration
*/
-static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key)
{
- u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+ u64 new_chain_key = iterate_chain_key(chain_key, hlock_id);
- printk(" class_idx:%d -> chain_key:%016Lx",
- class_idx,
+ printk(" hlock_id:%d -> chain_key:%016Lx",
+ (unsigned int)hlock_id,
(unsigned long long)new_chain_key);
return new_chain_key;
}
@@ -2923,12 +3578,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
hlock_next->irq_context);
for (; i < depth; i++) {
hlock = curr->held_locks + i;
- chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+ chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key);
print_lock(hlock);
}
- print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_chain_key_iteration(hlock_id(hlock_next), chain_key);
print_lock(hlock_next);
}
@@ -2936,14 +3591,14 @@ static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
u64 chain_key = INITIAL_CHAIN_KEY;
- int class_id;
+ u16 hlock_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
- class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id, chain_key);
+ hlock_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + class_id);
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -2992,7 +3647,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx;
+ id = hlock_id(&curr->held_locks[i]);
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -3041,7 +3696,6 @@ static inline int add_chain_cache(struct task_struct *curr,
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
int i, j;
@@ -3084,11 +3738,11 @@ static inline int add_chain_cache(struct task_struct *curr,
chain->base = j;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx;
+ int lock_id = hlock_id(curr->held_locks + i);
chain_hlocks[chain->base + j] = lock_id;
}
- chain_hlocks[chain->base + j] = class - lock_classes;
+ chain_hlocks[chain->base + j] = hlock_id(hlock);
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains(chain->irq_context);
@@ -3204,15 +3858,12 @@ static int validate_chain(struct task_struct *curr,
if (!ret)
return 0;
/*
- * Mark recursive read, as we jump over it when
- * building dependencies (just like we jump over
- * trylock entries):
- */
- if (ret == 2)
- hlock->read = 2;
- /*
* Add dependency only if this lock is not the head
- * of the chain, and if it's not a secondary read-lock:
+ * of the chain, and if the new lock introduces no more
+ * lock dependency (because we already hold a lock with the
+ * same lock class) nor deadlock (because the nest_lock
+ * serializes nesting locks), see the comments for
+ * check_deadlock().
*/
if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
@@ -3275,7 +3926,7 @@ static void check_chain_key(struct task_struct *curr)
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
chain_key = INITIAL_CHAIN_KEY;
- chain_key = iterate_chain_key(chain_key, hlock->class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -3304,11 +3955,11 @@ static void print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -3317,7 +3968,7 @@ static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (!debug_locks_off() || debug_locks_silent)
return;
pr_warn("\n");
@@ -3331,9 +3982,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- lockdep_hardirqs_enabled(curr),
+ lockdep_hardirqs_enabled(),
lockdep_softirqs_enabled(curr));
print_lock(this);
@@ -3358,6 +4009,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
print_usage_bug(curr, this, bad_bit, new_bit);
return 0;
}
@@ -3393,7 +4045,7 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
+ print_lock_name(NULL, other->class);
pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
pr_warn("\nother info that might help us debug this:\n");
@@ -3434,24 +4086,32 @@ print_irq_inversion_bug(struct task_struct *curr,
*/
static int
check_usage_forwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_root(&root, this);
+ ret = find_usage_forwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
return 0;
}
@@ -3461,42 +4121,52 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
*/
static int
check_usage_backwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_rootb(&root, this);
+ ret = find_usage_backwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
return 0;
}
void print_irqtrace_events(struct task_struct *curr)
{
- printk("irq event stamp: %u\n", curr->irq_events);
+ const struct irqtrace_events *trace = &curr->irqtrace;
+
+ printk("irq event stamp: %u\n", trace->irq_events);
printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
- curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
- (void *)curr->hardirq_enable_ip);
+ trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+ (void *)trace->hardirq_enable_ip);
printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
- curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
- (void *)curr->hardirq_disable_ip);
+ trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+ (void *)trace->hardirq_disable_ip);
printk("softirqs last enabled at (%u): [<%px>] %pS\n",
- curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
- (void *)curr->softirq_enable_ip);
+ trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+ (void *)trace->softirq_enable_ip);
printk("softirqs last disabled at (%u): [<%px>] %pS\n",
- curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
- (void *)curr->softirq_disable_ip);
+ trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+ (void *)trace->softirq_disable_ip);
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -3515,8 +4185,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-#define STRICT_READ_CHECKS 1
-
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
@@ -3542,16 +4210,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
- * mark USED_IN has to look forwards -- to ensure no dependency
- * has ENABLED state, which would allow recursion deadlocks.
- *
- * mark ENABLED has to look backwards -- to ensure no dependee
- * has USED_IN state, which, again, would allow recursion deadlocks.
- */
- check_usage_f usage = dir ?
- check_usage_backwards : check_usage_forwards;
-
- /*
* Validate that this particular lock does not have conflicting
* usage states.
*/
@@ -3559,23 +4217,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 0;
/*
- * Validate that the lock dependencies don't have conflicting usage
- * states.
+ * Check for read in write conflicts
*/
- if ((!read || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
+ if (!read && !valid_state(curr, this, new_bit,
+ excl_bit + LOCK_USAGE_READ_MASK))
return 0;
+
/*
- * Check for read in write conflicts
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
*/
- if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
+ if (dir) {
+ /*
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ if (!check_usage_backwards(curr, this, excl_bit))
return 0;
-
- if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
- state_name(new_bit + LOCK_USAGE_READ_MASK)))
+ } else {
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ */
+ if (!check_usage_forwards(curr, this, excl_bit))
return 0;
}
@@ -3637,19 +4302,27 @@ static void __trace_hardirqs_on_caller(void)
/**
* lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip: Caller address
*
* Invoked before a possible transition to RCU idle from exit to user or
* guest mode. This ensures that all RCU operations are done before RCU
* stops watching. After the RCU transition lockdep_hardirqs_on() has to be
* invoked to set the final state.
*/
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
{
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs do not (and cannot) track lock dependencies, nothing to do.
+ */
+ if (unlikely(in_nmi()))
return;
- if (unlikely(current->hardirqs_enabled)) {
+ if (unlikely(this_cpu_read(lockdep_recursion)))
+ return;
+
+ if (unlikely(lockdep_hardirqs_enabled())) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3677,12 +4350,12 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
return;
current->hardirq_chain_key = current->curr_chain_key;
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__trace_hardirqs_on_caller();
lockdep_recursion_finish();
}
@@ -3690,12 +4363,35 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
void noinstr lockdep_hardirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+ * tracking state and hardware state are out of sync.
+ *
+ * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+ * and not rely on hardware state like normal interrupts.
+ */
+ if (unlikely(in_nmi())) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
- if (unlikely(!debug_locks || curr->lockdep_recursion))
+ /*
+ * Skip:
+ * - recursion check, because NMI can hit lockdep;
+ * - hardware state check, because above;
+ * - chain_key check, see lockdep_hardirqs_on_prepare().
+ */
+ goto skip_checks;
+ }
+
+ if (unlikely(this_cpu_read(lockdep_recursion)))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3720,10 +4416,11 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
current->curr_chain_key);
+skip_checks:
/* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
+ __this_cpu_write(hardirqs_enabled, 1);
+ trace->hardirq_enable_ip = ip;
+ trace->hardirq_enable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_on_events);
}
EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
@@ -3733,9 +4430,18 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
*/
void noinstr lockdep_hardirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
+ if (unlikely(!debug_locks))
+ return;
- if (unlikely(!debug_locks || curr->lockdep_recursion))
+ /*
+ * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+ * they will restore the software state. This ensures the software
+ * state is consistent inside NMIs as well.
+ */
+ if (in_nmi()) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+ } else if (__this_cpu_read(lockdep_recursion))
return;
/*
@@ -3745,13 +4451,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->hardirqs_enabled = 0;
- curr->hardirq_disable_ip = ip;
- curr->hardirq_disable_event = ++curr->irq_events;
+ __this_cpu_write(hardirqs_enabled, 0);
+ trace->hardirq_disable_ip = ip;
+ trace->hardirq_disable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_off_events);
} else {
debug_atomic_inc(redundant_hardirqs_off);
@@ -3764,9 +4472,9 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
*/
void lockdep_softirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3776,26 +4484,26 @@ void lockdep_softirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
debug_atomic_inc(redundant_softirqs_on);
return;
}
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
/*
* We'll do an OFF -> ON transition:
*/
- curr->softirqs_enabled = 1;
- curr->softirq_enable_ip = ip;
- curr->softirq_enable_event = ++curr->irq_events;
+ current->softirqs_enabled = 1;
+ trace->softirq_enable_ip = ip;
+ trace->softirq_enable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_on_events);
/*
* We are going to turn softirqs on, so set the
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (curr->hardirqs_enabled)
- mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
+ if (lockdep_hardirqs_enabled())
+ mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
lockdep_recursion_finish();
}
@@ -3804,9 +4512,7 @@ void lockdep_softirqs_on(unsigned long ip)
*/
void lockdep_softirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3815,13 +4521,15 @@ void lockdep_softirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->softirqs_enabled = 0;
- curr->softirq_disable_ip = ip;
- curr->softirq_disable_event = ++curr->irq_events;
+ current->softirqs_enabled = 0;
+ trace->softirq_disable_ip = ip;
+ trace->softirq_disable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_off_events);
/*
* Whoops, we wanted softirqs off, so why aren't they?
@@ -3843,7 +4551,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -3852,7 +4560,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -3860,7 +4568,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -3890,7 +4604,7 @@ lock_used:
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
@@ -3923,13 +4637,18 @@ static int separate_irq_context(struct task_struct *curr,
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- unsigned int new_mask = 1 << new_bit, ret = 1;
+ unsigned int new_mask, ret = 1;
if (new_bit >= LOCK_USAGE_STATES) {
DEBUG_LOCKS_WARN_ON(1);
return 0;
}
+ if (new_bit == LOCK_USED && this->read)
+ new_bit = LOCK_USED_READ;
+
+ new_mask = 1 << new_bit;
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3942,26 +4661,26 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didn't race:
*/
- if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
- graph_unlock();
- return 1;
- }
+ if (unlikely(hlock_class(this)->usage_mask & new_mask))
+ goto unlock;
+
+ if (!hlock_class(this)->usage_mask)
+ debug_atomic_dec(nr_unused_locks);
hlock_class(this)->usage_mask |= new_mask;
- if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
- return 0;
+ if (new_bit < LOCK_TRACE_STATES) {
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
+ return 0;
+ }
- switch (new_bit) {
- case LOCK_USED:
- debug_atomic_dec(nr_unused_locks);
- break;
- default:
+ if (new_bit < LOCK_USED) {
ret = mark_lock_irq(curr, this, new_bit);
if (!ret)
return 0;
}
+unlock:
graph_unlock();
/*
@@ -3983,7 +4702,7 @@ static inline short task_wait_context(struct task_struct *curr)
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
- if (curr->hardirq_context) {
+ if (lockdep_hardirq_context()) {
/*
* Check if force_irqthreads will run us threaded.
*/
@@ -4037,7 +4756,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
/*
* Verify the wait_type context.
*
- * This check validates we takes locks in the right wait-type order; that is it
+ * This check validates we take locks in the right wait-type order; that is it
* ensures that we do not take mutexes inside spinlocks and do not attempt to
* acquire spinlocks inside raw_spinlocks and the sort.
*
@@ -4051,12 +4770,12 @@ print_lock_invalid_wait_context(struct task_struct *curr,
*/
static int check_wait_context(struct task_struct *curr, struct held_lock *next)
{
- short next_inner = hlock_class(next)->wait_type_inner;
- short next_outer = hlock_class(next)->wait_type_outer;
- short curr_inner;
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
int depth;
- if (!curr->lockdep_depth || !next_inner || next->trylock)
+ if (!next_inner || next->trylock)
return 0;
if (!next_outer)
@@ -4076,7 +4795,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- short prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4086,6 +4806,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
@@ -4125,9 +4853,9 @@ static inline int check_wait_context(struct task_struct *curr,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass,
- short inner, short outer)
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
@@ -4150,6 +4878,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
lock->wait_type_outer = outer;
lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
/*
* No key, no joy, we need to hash something.
@@ -4174,25 +4903,51 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
if (subclass) {
unsigned long flags;
- if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
register_lock_class(lock, subclass, 1);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
return;
@@ -4234,7 +4989,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4285,7 +5040,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4319,6 +5075,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -4365,10 +5122,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
- print_lock_nested_lock_not_held(curr, hlock, ip);
+ print_lock_nested_lock_not_held(curr, hlock);
return 0;
}
@@ -4380,6 +5137,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -4521,7 +5282,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -4565,9 +5326,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -4731,14 +5493,14 @@ int __lock_is_held(const struct lockdep_map *lock, int read)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- if (read == -1 || hlock->read == read)
- return 1;
+ if (read == -1 || !!hlock->read == read)
+ return LOCK_STATE_HELD;
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
}
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
@@ -4759,7 +5521,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -4819,22 +5581,26 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
/*
* Check whether we follow the irq-flags state precisely:
*/
-static void check_flags(unsigned long flags)
+static noinstr void check_flags(unsigned long flags)
{
#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
+ /* Get the warning out.. */
+ instrumentation_begin();
+
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-on.\n");
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -4849,9 +5615,12 @@ static void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
+
+ instrumentation_end();
#endif
}
@@ -4861,11 +5630,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
@@ -4878,11 +5647,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_downgrade(lock, ip))
check_chain_key(current);
@@ -4896,12 +5665,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
{
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class = look_up_lock_class(lock, subclass);
+ unsigned long mask = LOCKF_USED;
/* if it doesn't have a class (yet), it certainly hasn't been used yet */
if (!class)
return;
- if (!(class->usage_mask & LOCK_USED))
+ /*
+ * READ locks only conflict with USED, such that if we only ever use
+ * READ locks, there is no deadlock possible -- RCU.
+ */
+ if (!hlock->read)
+ mask |= LOCKF_USED_READ;
+
+ if (!(class->usage_mask & mask))
return;
hlock->class_idx = class - lock_classes;
@@ -4912,7 +5689,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
static bool lockdep_nmi(void)
{
- if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ if (raw_cpu_read(lockdep_recursion))
return false;
if (!in_nmi())
@@ -4922,6 +5699,20 @@ static bool lockdep_nmi(void)
}
/*
+ * read_lock() is recursive if:
+ * 1. We force lockdep think this way in selftests or
+ * 2. The implementation is not queued read/write lock or
+ * 3. The locker is at an in_interrupt() context.
+ */
+bool read_lock_is_recursive(void)
+{
+ return force_read_lock_recursive ||
+ !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) ||
+ in_interrupt();
+}
+EXPORT_SYMBOL_GPL(read_lock_is_recursive);
+
+/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
@@ -4931,7 +5722,12 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion)) {
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
+ if (!debug_locks)
+ return;
+
+ if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
struct held_lock hlock;
@@ -4954,10 +5750,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
- trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+ lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -4967,13 +5762,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ trace_lock_release(lock, ip);
+
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
- trace_lock_release(lock, ip);
+
+ lockdep_recursion_inc();
if (__lock_release(lock, ip))
check_chain_key(current);
lockdep_recursion_finish();
@@ -4981,18 +5778,50 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
- int ret = 0;
+ int ret = LOCK_STATE_NOT_HELD;
- if (unlikely(current->lockdep_recursion))
- return 1; /* avoid false negative lockdep_assert_held() */
+ /*
+ * Avoid false negative lockdep_assert_held() and
+ * lockdep_assert_not_held().
+ */
+ if (unlikely(!lockdep_enabled()))
+ return LOCK_STATE_UNKNOWN;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
ret = __lock_is_held(lock, read);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5007,13 +5836,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
struct pin_cookie cookie = NIL_COOKIE;
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return cookie;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
cookie = __lock_pin_lock(lock);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5026,13 +5855,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_repin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5043,13 +5872,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_unpin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5159,8 +5988,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
hlock->holdtime_stamp = now;
}
- trace_lock_acquired(lock, ip);
-
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -5179,16 +6006,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat || !debug_locks))
- return;
+ trace_lock_contended(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
- trace_lock_contended(lock, ip);
+ lockdep_recursion_inc();
__lock_contended(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5199,15 +6024,14 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat || !debug_locks))
- return;
+ trace_lock_acquired(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_acquired(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5246,7 +6070,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
int i;
for (i = chain->base; i < chain->base + chain->depth; i++) {
- if (chain_hlocks[i] != class - lock_classes)
+ if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes)
continue;
/*
* Each lock class occurs at most once in a lock chain so once
@@ -5319,6 +6143,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -5330,13 +6156,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -5609,7 +6432,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -5624,10 +6453,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -5635,11 +6462,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
@@ -5839,6 +6668,8 @@ asmlinkage __visible void lockdep_sys_exit(void)
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -5848,17 +6679,16 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
- ? "RCU used illegally from idle CPU!\n"
- : "",
- rcu_scheduler_active, debug_locks);
+ : "",
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -5880,5 +6710,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index baca699b94e9..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -19,9 +19,13 @@ enum lock_usage_bit {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
- LOCK_USAGE_STATES
+ LOCK_USED_READ,
+ LOCK_USAGE_STATES,
};
+/* states after LOCK_USED_READ are not traced and printed */
+static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES);
+
#define LOCK_USAGE_READ_MASK 1
#define LOCK_USAGE_DIR_MASK 2
#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
@@ -40,6 +44,7 @@ enum {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
+ __LOCKF(USED_READ)
};
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
@@ -94,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_STACK_TRACE_ENTRIES 262144UL
#define STACK_TRACE_HASH_SIZE 8192
#else
-#define MAX_LOCKDEP_ENTRIES 32768UL
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
-#define STACK_TRACE_HASH_SIZE 16384
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
#endif
/*
@@ -116,10 +121,9 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
@@ -146,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -200,7 +208,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..e2bfb1db589d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
#endif
-#ifdef CONFIG_PROVE_LOCKING
- seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
- seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
-#endif
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
- get_usage_chars(class, usage);
- seq_printf(m, " %s", usage);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+ }
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (entry->distance == 1) {
- seq_printf(m, " -> [%p] ", entry->class->key);
- print_name(m, entry->class);
- seq_puts(m, "\n");
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
}
+ seq_puts(m, "\n");
}
- seq_puts(m, "\n");
return 0;
}
@@ -218,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -252,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -343,12 +370,14 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
/*
- * Zappped classes and lockdep data buffers reuse statistics.
+ * Zapped classes and lockdep data buffers reuse statistics.
*/
seq_puts(m, "\n");
seq_printf(m, " zapped classes: %11lu\n",
@@ -411,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
@@ -423,7 +452,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -620,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -643,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -652,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..415d81e6ce70 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -27,45 +27,105 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
+#include <linux/reboot.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
+static unsigned long last_lock_release;
struct lock_stress_stats {
long n_lock_fail;
long n_lock_acquired;
};
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
+
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -74,13 +134,16 @@ static void lock_torture_cleanup(void);
*/
struct lock_torture_ops {
void (*init)(void);
- int (*writelock)(void);
+ void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
+ int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
- void (*writeunlock)(void);
- int (*readlock)(void);
+ void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
+ int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
- void (*readunlock)(void);
+ void (*readunlock)(int tid);
unsigned long flags; /* for irq spinlocks */
const char *name;
@@ -90,49 +153,82 @@ struct lock_torture_cxt {
int nrealwriters_stress;
int nrealreaders_stress;
bool debug_lock;
+ bool init_called;
atomic_t n_lock_torture_errors;
struct lock_torture_ops *cur_ops;
struct lock_stress_stats *lwsa; /* writer statistics */
struct lock_stress_stats *lrsa; /* reader statistics */
};
-static struct lock_torture_cxt cxt = { 0, 0, false,
+static struct lock_torture_cxt cxt = { 0, 0, false, false,
ATOMIC_INIT(0),
NULL, NULL};
/*
* Definitions for lock torture testing.
*/
-static int torture_lock_busted_write_lock(void)
+static int torture_lock_busted_write_lock(int tid __maybe_unused)
{
return 0; /* BUGGY, do not use in real life!!! */
}
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_lock_busted_write_unlock(void)
+static void torture_lock_busted_write_unlock(int tid __maybe_unused)
{
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -142,7 +238,8 @@ static struct lock_torture_ops lock_busted_ops = {
static DEFINE_SPINLOCK(torture_spinlock);
-static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+static int torture_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_spinlock)
{
spin_lock(&torture_spinlock);
return 0;
@@ -151,22 +248,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
+ j = jiffies;
+ mdelay(long_hold);
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+static void torture_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_spinlock)
{
spin_unlock(&torture_spinlock);
}
@@ -174,7 +273,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -182,7 +281,7 @@ static struct lock_torture_ops spin_lock_ops = {
.name = "spin_lock"
};
-static int torture_spin_lock_write_lock_irq(void)
+static int torture_spin_lock_write_lock_irq(int tid __maybe_unused)
__acquires(torture_spinlock)
{
unsigned long flags;
@@ -192,7 +291,7 @@ __acquires(torture_spinlock)
return 0;
}
-static void torture_lock_spin_write_unlock_irq(void)
+static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused)
__releases(torture_spinlock)
{
spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
@@ -201,7 +300,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -209,9 +308,63 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
-static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
write_lock(&torture_rwlock);
return 0;
@@ -220,24 +373,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_write_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
write_unlock(&torture_rwlock);
}
-static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
read_lock(&torture_rwlock);
return 0;
@@ -246,19 +399,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_read_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
read_unlock(&torture_rwlock);
}
@@ -266,7 +418,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -274,7 +426,8 @@ static struct lock_torture_ops rw_lock_ops = {
.name = "rw_lock"
};
-static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -283,13 +436,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_write_unlock_irq(void)
+static void torture_rwlock_write_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
-static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -298,7 +452,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_read_unlock_irq(void)
+static void torture_rwlock_read_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
@@ -307,7 +461,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -316,8 +470,31 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
-static int torture_mutex_lock(void) __acquires(torture_mutex)
+static int torture_mutex_lock(int tid __maybe_unused)
+__acquires(torture_mutex)
{
mutex_lock(&torture_mutex);
return 0;
@@ -325,28 +502,37 @@ static int torture_mutex_lock(void) __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_mutex_unlock(void) __releases(torture_mutex)
+static void torture_mutex_unlock(int tid __maybe_unused)
+__releases(torture_mutex)
{
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -354,12 +540,34 @@ static struct lock_torture_ops mutex_lock_ops = {
};
#include <linux/ww_mutex.h>
+/*
+ * The torture ww_mutexes should belong to the same lock class as
+ * torture_ww_class to avoid lockdep problem. The ww_mutex_init()
+ * function is called for initialization to ensure that.
+ */
static DEFINE_WD_CLASS(torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
+static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2;
+static struct ww_acquire_ctx *ww_acquire_ctxs;
-static int torture_ww_mutex_lock(void)
+static void torture_ww_mutex_init(void)
+{
+ ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class);
+
+ ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*ww_acquire_ctxs),
+ GFP_KERNEL);
+ if (!ww_acquire_ctxs)
+ VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory");
+}
+
+static void torture_ww_mutex_exit(void)
+{
+ kfree(ww_acquire_ctxs);
+}
+
+static int torture_ww_mutex_lock(int tid)
__acquires(torture_ww_mutex_0)
__acquires(torture_ww_mutex_1)
__acquires(torture_ww_mutex_2)
@@ -369,7 +577,7 @@ __acquires(torture_ww_mutex_2)
struct list_head link;
struct ww_mutex *lock;
} locks[3], *ll, *ln;
- struct ww_acquire_ctx ctx;
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
locks[0].lock = &torture_ww_mutex_0;
list_add(&locks[0].link, &list);
@@ -380,12 +588,12 @@ __acquires(torture_ww_mutex_2)
locks[2].lock = &torture_ww_mutex_2;
list_add(&locks[2].link, &list);
- ww_acquire_init(&ctx, &torture_ww_class);
+ ww_acquire_init(ctx, &torture_ww_class);
list_for_each_entry(ll, &list, link) {
int err;
- err = ww_mutex_lock(ll->lock, &ctx);
+ err = ww_mutex_lock(ll->lock, ctx);
if (!err)
continue;
@@ -396,28 +604,32 @@ __acquires(torture_ww_mutex_2)
if (err != -EDEADLK)
return err;
- ww_mutex_lock_slow(ll->lock, &ctx);
+ ww_mutex_lock_slow(ll->lock, ctx);
list_move(&ll->link, &list);
}
- ww_acquire_fini(&ctx);
return 0;
}
-static void torture_ww_mutex_unlock(void)
+static void torture_ww_mutex_unlock(int tid)
__releases(torture_ww_mutex_0)
__releases(torture_ww_mutex_1)
__releases(torture_ww_mutex_2)
{
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
+
ww_mutex_unlock(&torture_ww_mutex_0);
ww_mutex_unlock(&torture_ww_mutex_1);
ww_mutex_unlock(&torture_ww_mutex_2);
+ ww_acquire_fini(ctx);
}
static struct lock_torture_ops ww_mutex_lock_ops = {
+ .init = torture_ww_mutex_init,
+ .exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -427,79 +639,85 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
-static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+static void torture_rtmutex_init(void)
{
- rt_mutex_lock(&torture_rtmutex);
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
{
- int policy;
- struct sched_param param;
- const unsigned int factor = 50000; /* yes, quite arbitrary */
+ int i;
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- policy = SCHED_FIFO;
- param.sched_priority = MAX_RT_PRIO - 1;
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- policy = SCHED_NORMAL;
- param.sched_priority = 0;
- } else /* common case, do nothing */
- return;
- }
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
- sched_setscheduler_nocheck(current, policy, &param);
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+static void torture_rtmutex_unlock(int tid __maybe_unused)
+__releases(torture_rtmutex)
{
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -508,7 +726,8 @@ static struct lock_torture_ops rtmutex_lock_ops = {
#endif
static DECLARE_RWSEM(torture_rwsem);
-static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_write(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_write(&torture_rwsem);
return 0;
@@ -516,24 +735,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+static void torture_rwsem_up_write(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_write(&torture_rwsem);
}
-static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_read(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_read(&torture_rwsem);
return 0;
@@ -541,19 +757,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+static void torture_rwsem_up_read(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_read(&torture_rwsem);
}
@@ -561,7 +775,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -572,38 +786,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
#include <linux/percpu-rwsem.h>
static struct percpu_rw_semaphore pcpu_rwsem;
-void torture_percpu_rwsem_init(void)
+static void torture_percpu_rwsem_init(void)
{
BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
}
-static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+static void torture_percpu_rwsem_exit(void)
+{
+ percpu_free_rwsem(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_write(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_write(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_write(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_write(&pcpu_rwsem);
}
-static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+static int torture_percpu_rwsem_down_read(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_read(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_read(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_read(&pcpu_rwsem);
}
static struct lock_torture_ops percpu_rwsem_lock_ops = {
.init = torture_percpu_rwsem_init,
+ .exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -617,28 +841,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
DEFINE_TORTURE_RANDOM(rand);
+ bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
+
cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock();
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = 1;
- if (WARN_ON_ONCE(lock_is_read_held))
- lwsp->n_lock_fail++; /* rare, but... */
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
+ lwsp->n_lock_acquired++;
+
+ cxt.cur_ops->write_delay(&rand);
- lwsp->n_lock_acquired++;
- cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = 0;
- cxt.cur_ops->writeunlock();
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -655,6 +914,7 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
+ int tid = lrsp - cxt.lrsa;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
@@ -664,15 +924,15 @@ static int lock_torture_reader(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->readlock();
- lock_is_read_held = 1;
+ cxt.cur_ops->readlock(tid);
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = 0;
- cxt.cur_ops->readunlock();
+ atomic_dec(&lock_is_read_held);
+ cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
} while (!torture_must_stop());
@@ -686,20 +946,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
- bool fail = 0;
+ long cur;
+ bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -771,16 +1033,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -792,17 +1107,17 @@ static void lock_torture_cleanup(void)
/*
* Indicates early cleanup, meaning that the test has not run,
- * such as when passing bogus args when loading the module. As
- * such, only perform the underlying torture-specific cleanups,
- * and avoid anything related to locktorture.
+ * such as when passing bogus args when loading the module.
+ * However cxt->cur_ops.init() may have been invoked, so beside
+ * perform the underlying torture-specific cleanups, cur_ops.exit()
+ * will be invoked if needed.
*/
if (!cxt.lwsa && !cxt.lrsa)
goto end;
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -833,7 +1148,14 @@ static void lock_torture_cleanup(void)
kfree(cxt.lrsa);
cxt.lrsa = NULL;
+ call_rcu_chain_cleanup();
+
end:
+ if (cxt.init_called) {
+ if (cxt.cur_ops->exit)
+ cxt.cur_ops->exit();
+ cxt.init_called = false;
+ }
torture_cleanup_end();
}
@@ -844,6 +1166,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -874,20 +1197,23 @@ static int __init lock_torture_init(void)
goto unwind;
}
- if (nwriters_stress == 0 && nreaders_stress == 0) {
+ if (nwriters_stress == 0 &&
+ (!cxt.cur_ops->readlock || nreaders_stress == 0)) {
pr_alert("lock-torture: must run at least one locking thread\n");
firsterr = -EINVAL;
goto unwind;
}
- if (cxt.cur_ops->init)
- cxt.cur_ops->init();
-
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
else
cxt.nrealwriters_stress = 2 * num_online_cpus();
+ if (cxt.cur_ops->init) {
+ cxt.cur_ops->init();
+ cxt.init_called = true;
+ }
+
#ifdef CONFIG_DEBUG_MUTEXES
if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
@@ -904,7 +1230,7 @@ static int __init lock_torture_init(void)
/* Initialize the statistics so that each run gets its own numbers. */
if (nwriters_stress) {
- lock_is_write_held = 0;
+ lock_is_write_held = false;
cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
sizeof(*cxt.lwsa),
GFP_KERNEL);
@@ -935,7 +1261,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = 0;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
@@ -954,29 +1279,33 @@ static int __init lock_torture_init(void)
}
}
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -985,18 +1314,22 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@@ -1018,10 +1351,13 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
- if (firsterr)
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1029,13 +1365,15 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
@@ -1044,6 +1382,10 @@ static int __init lock_torture_init(void)
unwind:
torture_init_end();
lock_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5e10153b4d3c..85251d8771d9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -7,7 +7,7 @@
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
* to acquire the lock spinning on a local variable.
- * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * It avoids expensive cache bounces that common test-and-set spin-lock
* implementations incur.
*/
#ifndef __LINUX_MCS_SPINLOCK_H
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index a7276aaf2abc..bc8abb8549d2 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -22,7 +20,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
@@ -57,7 +56,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
task->blocked_on = waiter;
}
-void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -65,7 +64,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
task->blocked_on = NULL;
- list_del_init(&waiter->list);
+ INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
}
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 1edd3f45a4ec..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5352ce50a97e..cbae8c0b89ab 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,17 +30,23 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
+
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
+# define MUTEX_WARN_ON(cond)
#endif
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
@@ -86,70 +92,64 @@ bool mutex_is_locked(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_is_locked);
-__must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock)
-{
- if (unlikely(__mutex_owner(lock) == current))
- return MUTEX_TRYLOCK_RECURSIVE;
-
- return mutex_trylock(lock);
-}
-EXPORT_SYMBOL(mutex_trylock_recursive);
-
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
}
/*
- * Trylock variant that retuns the owning task on failure.
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
*/
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
- unsigned long old, flags = __owner_flags(owner);
+ unsigned long flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (task) {
- if (likely(task != curr))
- break;
-
- if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
break;
-
- flags &= ~MUTEX_FLAG_PICKUP;
+ }
} else {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
}
- /*
- * We set the HANDOFF bit, we must make sure it doesn't live
- * past the point where we acquire it. This would be possible
- * if we (accidentally) set the bit on an unlocked mutex.
- */
- flags &= ~MUTEX_FLAG_HANDOFF;
-
- old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
- if (old == owner)
- return NULL;
-
- owner = old;
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
}
return __owner_task(owner);
}
/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
- return !__mutex_trylock_or_owner(lock);
+ return !__mutex_trylock_common(lock, false);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -178,10 +178,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
- if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
- return true;
-
- return false;
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
#endif
@@ -204,7 +201,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
* Add @waiter to a given location in the lock wait_list and set the
* FLAG_WAITERS flag if it's the first waiter.
*/
-static void __sched
+static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
@@ -215,9 +212,19 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
}
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+}
+
/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
* WAITERS. Provides RELEASE semantics like a regular unlock, the
* __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
*/
@@ -226,23 +233,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
unsigned long owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old, new;
+ unsigned long new;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
- old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
- if (old == owner)
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
break;
-
- owner = old;
}
}
@@ -286,218 +288,18 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-/*
- * Wait-Die:
- * The newer transactions are killed when:
- * It (the new transaction) makes a request for a lock being held
- * by an older transaction.
- *
- * Wound-Wait:
- * The newer transactions are wounded when:
- * An older transaction makes a request for a lock being held by
- * the newer transaction.
- */
-
-/*
- * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
- * it.
- */
-static __always_inline void
-ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
- ww->ctx = ww_ctx;
-}
+#include "ww_mutex.h"
-/*
- * Determine if context @a is 'after' context @b. IOW, @a is a younger
- * transaction than @b and depending on algorithm either needs to wait for
- * @b or die.
- */
-static inline bool __sched
-__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
-{
-
- return (signed long)(a->stamp - b->stamp) > 0;
-}
-
-/*
- * Wait-Die; wake a younger waiter context (when locks held) such that it can
- * die.
- *
- * Among waiters with context, only the first one can have other locks acquired
- * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
- * __ww_mutex_check_kill() wake any but the earliest context.
- */
-static bool __sched
-__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ww_ctx)
-{
- if (!ww_ctx->is_wait_die)
- return false;
-
- if (waiter->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, waiter);
- wake_up_process(waiter->task);
- }
-
- return true;
-}
-
-/*
- * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
- *
- * Wound the lock holder if there are waiters with older transactions than
- * the lock holders. Even if multiple waiters may wound the lock holder,
- * it's sufficient that only one does.
- */
-static bool __ww_mutex_wound(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx,
- struct ww_acquire_ctx *hold_ctx)
-{
- struct task_struct *owner = __mutex_owner(lock);
-
- lockdep_assert_held(&lock->wait_lock);
-
- /*
- * Possible through __ww_mutex_add_waiter() when we race with
- * ww_mutex_set_context_fastpath(). In that case we'll get here again
- * through __ww_mutex_check_waiters().
- */
- if (!hold_ctx)
- return false;
-
- /*
- * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
- * it cannot go away because we'll have FLAG_WAITERS set and hold
- * wait_lock.
- */
- if (!owner)
- return false;
-
- if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
- hold_ctx->wounded = 1;
-
- /*
- * wake_up_process() paired with set_current_state()
- * inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_check_kill() or has a
- * wakeup pending to re-read the wounded state.
- */
- if (owner != current)
- wake_up_process(owner);
-
- return true;
- }
-
- return false;
-}
-
-/*
- * We just acquired @lock under @ww_ctx, if there are later contexts waiting
- * behind us on the wait-list, check if they need to die, or wound us.
- *
- * See __ww_mutex_add_waiter() for the list-order construction; basically the
- * list is ordered by stamp, smallest (oldest) first.
- *
- * This relies on never mixing wait-die/wound-wait on the same wait-list;
- * which is currently ensured by that being a ww_class property.
- *
- * The current task must not be on the wait list.
- */
-static void __sched
-__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
-
- lockdep_assert_held(&lock->wait_lock);
-
- list_for_each_entry(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_mutex_die(lock, cur, ww_ctx) ||
- __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
- break;
- }
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
- * and wake up any waiters so they can recheck.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- ww_mutex_lock_acquired(lock, ctx);
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the WAITERS check is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* See comments above and below. */
-
- /*
- * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
- * MB MB
- * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
- *
- * The memory barrier above pairs with the memory barrier in
- * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
- * and/or !empty list.
- */
- if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
- return;
-
- /*
- * Uh oh, we raced in fastpath, check if any of the waiters need to
- * die or wound us.
- */
- spin_lock(&lock->base.wait_lock);
- __ww_mutex_check_waiters(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
+ return __mutex_trylock_common(lock, false);
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
@@ -552,21 +354,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
- rcu_read_lock();
+ lockdep_assert_preemption_disabled();
+
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@@ -578,7 +382,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -591,19 +394,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = __mutex_owner(lock);
-
/*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
*/
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@@ -636,7 +439,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -712,7 +515,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -729,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
@@ -754,192 +562,32 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
+ __ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-
-static __always_inline int __sched
-__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- return 0;
-}
-
-
-/*
- * Check the wound condition for the current lock acquire.
- *
- * Wound-Wait: If we're wounded, kill ourself.
- *
- * Wait-Die: If we're trying to acquire a lock already held by an older
- * context, kill ourselves.
- *
- * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
- * look at waiters before us in the wait-list.
- */
-static inline int __sched
-__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
- struct mutex_waiter *cur;
-
- if (ctx->acquired == 0)
- return 0;
-
- if (!ctx->is_wait_die) {
- if (ctx->wounded)
- return __ww_mutex_kill(lock, ctx);
-
- return 0;
- }
-
- if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- return __ww_mutex_kill(lock, ctx);
-
- /*
- * If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must kill ourself.
- */
- cur = waiter;
- list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- return __ww_mutex_kill(lock, ctx);
- }
-
- return 0;
-}
-
-/*
- * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
- * first. Such that older contexts are preferred to acquire the lock over
- * younger contexts.
- *
- * Waiters without context are interspersed in FIFO order.
- *
- * Furthermore, for Wait-Die kill ourself immediately when possible (there are
- * older contexts already waiting) to avoid unnecessary waiting and for
- * Wound-Wait ensure we wound the owning context when it is younger.
- */
-static inline int __sched
-__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
- struct list_head *pos;
- bool is_wait_die;
-
- if (!ww_ctx) {
- __mutex_add_waiter(lock, waiter, &lock->wait_list);
- return 0;
- }
-
- is_wait_die = ww_ctx->is_wait_die;
-
- /*
- * Add the waiter before the first waiter with a higher stamp.
- * Waiters without a context are skipped to avoid starving
- * them. Wait-Die waiters may die here. Wound-Wait waiters
- * never die here, but they are sorted in stamp order and
- * may wound the lock holder.
- */
- pos = &lock->wait_list;
- list_for_each_entry_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /*
- * Wait-Die: if we find an older context waiting, there
- * is no point in queueing behind it, as we'd have to
- * die the moment it would acquire the lock.
- */
- if (is_wait_die) {
- int ret = __ww_mutex_kill(lock, ww_ctx);
-
- if (ret)
- return ret;
- }
-
- break;
- }
-
- pos = &cur->list;
-
- /* Wait-Die: ensure younger waiters die. */
- __ww_mutex_die(lock, cur, ww_ctx);
- }
-
- __mutex_add_waiter(lock, waiter, pos);
-
- /*
- * Wound-Wait: if we're blocking on a mutex owned by a younger context,
- * wound that such that we might proceed.
- */
- if (!is_wait_die) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-
- /*
- * See ww_mutex_set_context_fastpath(). Orders setting
- * MUTEX_FLAG_WAITERS vs the ww->ctx load,
- * such that either we or the fastpath will wound @ww->ctx.
- */
- smp_mb();
- __ww_mutex_wound(lock, ww_ctx, ww->ctx);
- }
-
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -950,44 +598,48 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
if (ww_ctx->acquired == 0)
ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
}
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
}
debug_mutex_lock_common(lock, &waiter);
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
__mutex_add_waiter(lock, &waiter, &lock->wait_list);
-
-
-#ifdef CONFIG_DEBUG_MUTEXES
- waiter.ww_ctx = MUTEX_POISON_WW_CTX;
-#endif
} else {
/*
* Add in stamp order, waking up waiters that must kill
@@ -996,14 +648,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
goto err_early_kill;
-
- waiter.ww_ctx = ww_ctx;
}
- waiter.task = current;
-
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -1023,24 +674,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- /*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
- */
- if ((use_ww_ctx && ww_ctx) || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
+ first = __mutex_waiter_is_first(lock, &waiter);
set_current_state(state);
/*
@@ -1048,17 +691,23 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
- spin_lock(&lock->wait_lock);
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock(&lock->wait_lock);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1068,28 +717,28 @@ acquired:
__ww_mutex_check_waiters(lock, ww_ctx);
}
- mutex_remove_waiter(lock, &waiter, current);
- if (likely(list_empty(&lock->wait_list)))
- __mutex_clear_flag(lock, MUTEX_FLAGS);
+ __mutex_remove_waiter(lock, &waiter);
debug_mutex_free_waiter(&waiter);
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
err:
__set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current);
+ __mutex_remove_waiter(lock, &waiter);
err_early_kill:
- spin_unlock(&lock->wait_lock);
+ trace_contention_end(lock, ret);
+ raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
@@ -1097,19 +746,56 @@ err_early_kill:
}
static int __sched
-__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip,
- struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+}
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
- return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL(ww_mutex_trylock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
@@ -1188,8 +874,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1204,8 +889,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1236,29 +920,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
if (owner & MUTEX_FLAG_HANDOFF)
break;
- old = atomic_long_cmpxchg_release(&lock->owner, owner,
- __owner_flags(owner));
- if (old == owner) {
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
if (owner & MUTEX_FLAG_WAITERS)
break;
return;
}
-
- owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1275,7 +951,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
@@ -1379,7 +1055,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1387,7 +1063,7 @@ static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1411,9 +1087,7 @@ int __sched mutex_trylock(struct mutex *lock)
{
bool locked;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
locked = __mutex_trylock(lock);
if (locked)
@@ -1454,7 +1128,11 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 1c2287d3fa71..0b2a79c4013b 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -5,21 +5,41 @@
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define mutex_remove_waiter(lock, waiter, task) \
- __list_del((waiter)->list.prev, (waiter)->list.next)
-
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
-{
-}
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock, name, key) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1f7734949ac8..75a6f6133866 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -135,7 +138,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
*/
/*
- * Wait to acquire the lock or cancelation. Note that need_resched()
+ * Wait to acquire the lock or cancellation. Note that need_resched()
* will come with an IPI, which will wake smp_cond_load_relaxed() if it
* is implemented with a monitor-wait. vcpu_is_preempted() relies on
* polling, be careful.
@@ -154,13 +157,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
*/
for (;;) {
- if (prev->next == node &&
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
cmpxchg(&prev->next, node, NULL) == node)
break;
/*
* We can only fail the cmpxchg() racing against an unlock(),
- * in which case we should observe @node->locked becomming
+ * in which case we should observe @node->locked becoming
* true.
*/
if (smp_load_acquire(&node->locked))
@@ -182,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -222,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203..185bd1c906b0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,7 +7,9 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -45,7 +47,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
/*
* Due to having preemption disabled the decrement happens on
@@ -71,7 +73,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(!atomic_read_acquire(&sem->block)))
return true;
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
/* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
@@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -170,9 +172,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);
return true;
}
@@ -188,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
@@ -211,10 +221,11 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -236,6 +247,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fe9ca92faa2a..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,13 +12,13 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
-#include <asm/qrwlock.h>
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -35,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -52,32 +54,39 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
+ int cnts;
+
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
/* Set the waiting flag to notify readers that a writer is pending */
- atomic_add(_QW_WAITING, &lock->cnts);
+ atomic_or(_QW_WAITING, &lock->cnts);
/* When no more readers or writers, set the locked flag */
do {
- atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
- } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) != _QW_WAITING);
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
/*
* Include queued spinlock statistics code
@@ -312,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 old, tail;
@@ -370,7 +371,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -379,7 +380,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
@@ -401,6 +402,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
@@ -581,4 +586,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+ nopvspin = true;
+ return 0;
+}
+early_param("nopvspin", parse_nopvspin);
#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..6a0184e9c234 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -486,10 +486,20 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct pv_node *node;
@@ -533,18 +543,8 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
u8 locked;
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
deleted file mode 100644
index 36e69100e8e0..000000000000
--- a/kernel/locking/rtmutex-debug.c
+++ /dev/null
@@ -1,182 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This code is based on the rt.c implementation in the preempt-rt tree.
- * Portions of said code are
- *
- * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
- * Copyright (C) 2006 Esben Nielsen
- * Copyright (C) 2006 Kihon Technologies Inc.,
- * Steven Rostedt <rostedt@goodmis.org>
- *
- * See rt.c in preempt-rt for proper credits and further information
- */
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/debug.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/spinlock.h>
-#include <linux/kallsyms.h>
-#include <linux/syscalls.h>
-#include <linux/interrupt.h>
-#include <linux/rbtree.h>
-#include <linux/fs.h>
-#include <linux/debug_locks.h>
-
-#include "rtmutex_common.h"
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
- if (lock->name)
- printk(" [%p] {%s}\n",
- lock, lock->name);
- else
- printk(" [%p] {%s:%d}\n",
- lock, lock->file, lock->line);
-
- if (print_owner && rt_mutex_owner(lock)) {
- printk(".. ->owner: %p\n", lock->owner);
- printk(".. held by: ");
- printk_task(rt_mutex_owner(lock));
- printk("\n");
- }
-}
-
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *act_waiter,
- struct rt_mutex *lock)
-{
- struct task_struct *task;
-
- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
- return;
-
- task = rt_mutex_owner(act_waiter->lock);
- if (task && task != current) {
- act_waiter->deadlock_task_pid = get_pid(task_pid(task));
- act_waiter->deadlock_lock = lock;
- }
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
- struct task_struct *task;
-
- if (!waiter->deadlock_lock || !debug_locks)
- return;
-
- rcu_read_lock();
- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
- if (!task) {
- rcu_read_unlock();
- return;
- }
-
- if (!debug_locks_off()) {
- rcu_read_unlock();
- return;
- }
-
- pr_warn("\n");
- pr_warn("============================================\n");
- pr_warn("WARNING: circular locking deadlock detected!\n");
- pr_warn("%s\n", print_tainted());
- pr_warn("--------------------------------------------\n");
- printk("%s/%d is deadlocking current task %s/%d\n\n",
- task->comm, task_pid_nr(task),
- current->comm, task_pid_nr(current));
-
- printk("\n1) %s/%d is trying to acquire this lock:\n",
- current->comm, task_pid_nr(current));
- printk_lock(waiter->lock, 1);
-
- printk("\n2) %s/%d is blocked on this lock:\n",
- task->comm, task_pid_nr(task));
- printk_lock(waiter->deadlock_lock, 1);
-
- debug_show_held_locks(current);
- debug_show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task_pid_nr(task));
- show_stack(task, NULL, KERN_DEFAULT);
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, task_pid_nr(current));
- dump_stack();
- debug_show_all_locks();
- rcu_read_unlock();
-
- printk("[ turning off deadlock detection."
- "Please report this trace. ]\n\n");
-}
-
-void debug_rt_mutex_lock(struct rt_mutex *lock)
-{
-}
-
-void debug_rt_mutex_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
-}
-
-void
-debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
-{
-}
-
-void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
-}
-
-void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- memset(waiter, 0x11, sizeof(*waiter));
- waiter->deadlock_task_pid = NULL;
-}
-
-void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
-{
- put_pid(waiter->deadlock_task_pid);
- memset(waiter, 0x22, sizeof(*waiter));
-}
-
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
-{
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lock->name = name;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
deleted file mode 100644
index fc549713bba3..000000000000
--- a/kernel/locking/rtmutex-debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c. Debug version.
- */
-
-extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock);
-extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner);
-extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter,
- struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w) \
- do { (w)->deadlock_lock = NULL; } while (0)
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk walk)
-{
- return (waiter != NULL);
-}
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- debug_rt_mutex_print_deadlock(w);
-}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index cfdd5b93264d..4a10e8c16fd2 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,20 +8,60 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
+#include <linux/ww_mutex.h>
+
+#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
+
/*
* lock->owner state tracking:
*
@@ -49,24 +89,41 @@
* set this bit before looking at the lock.
*/
-static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
-static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -132,8 +189,21 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -141,15 +211,31 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_release(&lock->owner, &old, new);
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
* all future threads that attempt to [Rmw] the lock to the slowpath. As such
* relaxed semantics suffice.
*/
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -157,6 +243,13 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg_relaxed(p, owner,
owner | RT_MUTEX_HAS_WAITERS) != owner);
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -165,8 +258,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
@@ -201,10 +294,36 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+
+}
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -213,8 +332,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
@@ -223,15 +342,53 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
*/
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
+}
+
+/*
+ * Only use with rt_waiter_node_{less,equal}()
+ */
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static inline int
-rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -248,9 +405,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static inline int
-rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -267,76 +423,110 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
-static void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rb_node **link = &lock->waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+ return false;
+
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+#else
+ return false;
+#endif
+}
+
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
+
+static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
}
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+ return 0;
}
-static void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static __always_inline void
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
+}
+
+static __always_inline void
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+}
+
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
+
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
-static void
+static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ lockdep_assert_held(&task->pi_lock);
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
-static void
+static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
return;
- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -345,6 +535,42 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(task);
+ wqh->rtlock_task = task;
+ } else {
+ wake_q_add(&wqh->head, task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -358,25 +584,16 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
* deadlock detection is disabled independent of the detect argument
* and the config settings.
*/
-static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk chwalk)
+static __always_inline bool
+rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
{
- /*
- * This is just a wrapper function for the following call,
- * because debug_rt_mutex_detect_deadlock() smells like a magic
- * debug feature and I wanted to keep the cond function in the
- * main source file along with the comments instead of having
- * two of the same in the headers.
- */
- return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -405,9 +622,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -422,39 +644,44 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
bool requeue = true;
@@ -537,6 +764,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -565,7 +817,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -573,13 +825,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -597,9 +854,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -671,18 +940,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -699,13 +968,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
@@ -718,13 +993,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
* The waiter was the top waiter on the lock, but is
- * no longer the top prority waiter. Replace waiter in
+ * no longer the top priority waiter. Replace waiter in
* the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
@@ -734,8 +1010,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -802,8 +1079,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int __sched
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -838,19 +1116,21 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -861,13 +1141,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -904,9 +1180,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
raw_spin_unlock(&task->pi_lock);
takeit:
- /* We got the lock. */
- debug_rt_mutex_lock(lock);
-
/*
* This either preserves the RT_MUTEX_HAS_WAITERS bit if there
* are still waiters or clears it.
@@ -923,14 +1196,15 @@ takeit:
*
* This must be called with lock->wait_lock held and interrupts disabled
*/
-static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- enum rtmutex_chainwalk chwalk)
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
+ enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
lockdep_assert_held(&lock->wait_lock);
@@ -943,15 +1217,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -962,6 +1239,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&task->pi_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
+
if (!owner)
return 0;
@@ -970,7 +1262,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1012,11 +1304,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
- struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1029,7 +1323,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1049,244 +1343,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
}
-/*
- * Remove a waiter from a lock and give up
- *
- * Must be called with lock->wait_lock held and interrupts disabled. I must
- * have just failed to try_to_take_rt_mutex().
- */
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
-{
- bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
- struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
-
- lockdep_assert_held(&lock->wait_lock);
-
- raw_spin_lock(&current->pi_lock);
- rt_mutex_dequeue(lock, waiter);
- current->pi_blocked_on = NULL;
- raw_spin_unlock(&current->pi_lock);
-
- /*
- * Only update priority if the waiter was the highest priority
- * waiter of the lock and there is an owner to update.
- */
- if (!owner || !is_top_waiter)
- return;
-
- raw_spin_lock(&owner->pi_lock);
-
- rt_mutex_dequeue_pi(owner, waiter);
-
- if (rt_mutex_has_waiters(lock))
- rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-
- rt_mutex_adjust_prio(owner);
-
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
-
- raw_spin_unlock(&owner->pi_lock);
-
- /*
- * Don't walk the chain, if the owner task is not blocked
- * itself.
- */
- if (!next_lock)
- return;
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
- next_lock, NULL, current);
-
- raw_spin_lock_irq(&lock->wait_lock);
-}
-
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
-}
-
-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
-/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
- * @lock: the rt_mutex to take
- * @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
- * @timeout: the pre-initialized and started timer, or NULL for none
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Must be called with lock->wait_lock held and interrupts disabled
- */
-static int __sched
-__rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
-{
- int ret = 0;
-
- for (;;) {
- /* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
- break;
-
- /*
- * TASK_INTERRUPTIBLE checks for signals and
- * timeout. Ignored otherwise.
- */
- if (likely(state == TASK_INTERRUPTIBLE)) {
- /* Signal pending? */
- if (signal_pending(current))
- ret = -EINTR;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- if (ret)
- break;
- }
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- debug_rt_mutex_print_deadlock(waiter);
-
- schedule();
-
- raw_spin_lock_irq(&lock->wait_lock);
- set_current_state(state);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
- struct rt_mutex_waiter *w)
-{
- /*
- * If the result is not -EDEADLOCK or the caller requested
- * deadlock detection, nothing to do here.
- */
- if (res != -EDEADLOCK || detect_deadlock)
- return;
-
- /*
- * Yell lowdly and stop the task right here.
- */
- rt_mutex_print_deadlock(w);
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
-}
-
-/*
- * Slow path lock function:
- */
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
-{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
-
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
-
- if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
- __set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
- }
-
- /*
- * try_to_take_rt_mutex() sets the waiter bit
- * unconditionally. We might have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
-
- return ret;
-}
-
-static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
int ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1294,7 +1358,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
* try_to_take_rt_mutex() sets the lock waiters bit
* unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1302,7 +1366,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path try-lock function:
*/
-static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
unsigned long flags;
int ret;
@@ -1328,14 +1392,20 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return ret;
}
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
/*
* Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
*/
-static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
{
+ DEFINE_RT_WAKE_Q(wqh);
unsigned long flags;
/* irqsave required to support early boot calls */
@@ -1377,7 +1447,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock, flags) == true)
- return false;
+ return;
/* Relock the rtmutex and try again */
raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
@@ -1388,534 +1458,407 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ mark_wakeup_next_waiter(&wqh, lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return true; /* call rt_mutex_postunlock() */
+ rt_mutex_wake_up_q(&wqh);
}
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static inline int
-rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ rt_mutex_slowunlock(lock);
}
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ bool res = true;
- return slowfn(lock, state, timeout, chwalk);
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
}
-
-static inline int
-rt_mutex_fasttrylock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock))
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
-
- return slowfn(lock);
+ return false;
}
+#endif
+#ifdef RT_MUTEX_BUILD_MUTEX
/*
- * Performs the wakeup of the the top-waiter and re-enables preemption.
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
*/
-void rt_mutex_postunlock(struct wake_q_head *wake_q)
+
+/*
+ * Remove a waiter from a lock and give up
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
+ * have just failed to try_to_take_rt_mutex().
+ */
+static void __sched remove_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
- wake_up_q(wake_q);
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_base *next_lock;
- /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
- preempt_enable();
-}
+ lockdep_assert_held(&lock->wait_lock);
-static inline void
-rt_mutex_fastunlock(struct rt_mutex *lock,
- bool (*slowfn)(struct rt_mutex *lock,
- struct wake_q_head *wqh))
-{
- DEFINE_WAKE_Q(wake_q);
+ raw_spin_lock(&current->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock(&current->pi_lock);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (slowfn(lock, &wake_q))
- rt_mutex_postunlock(&wake_q);
-}
-
-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
-{
- might_sleep();
+ raw_spin_lock(&owner->pi_lock);
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
-}
+ rt_mutex_dequeue_pi(owner, waiter);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
- * rt_mutex_lock_nested - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- * @subclass: the lockdep subclass
- */
-void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
-{
- __rt_mutex_lock(lock, subclass);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+ rt_mutex_adjust_prio(lock, owner);
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
-{
- __rt_mutex_lock(lock, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
-#endif
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
-/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- */
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
-{
- int ret;
+ raw_spin_unlock(&owner->pi_lock);
- might_sleep();
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
+ return;
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+ raw_spin_unlock_irq(&lock->wait_lock);
-/*
- * Futex variant, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
-}
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
-int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return __rt_mutex_slowtrylock(lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
- *
- * @lock: the rt_mutex to be locked
- * @timeout: timeout structure or NULL (no timeout)
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
*
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -ETIMEDOUT when the timeout expired
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
{
- int ret;
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
+ int ret = 0;
- might_sleep();
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter))
+ break;
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_MIN_CHAINWALK,
- rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
+ if (ret)
+ break;
+ }
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to
- * call it from atomic regions, but not from hard interrupt or soft
- * interrupt context.
- *
- * Returns 1 on success and 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- int ret;
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
- return 0;
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ rt_mutex_schedule();
- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(state);
+ }
+ __set_current_state(TASK_RUNNING);
return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
+static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
{
- mutex_release(&lock->dep_map, _RET_IP_);
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
+ /*
+ * Yell loudly and stop the task right here.
+ */
+ WARN(1, "rtmutex deadlock detected\n");
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ rt_mutex_schedule();
+ }
}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
*/
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
+ int ret;
+
lockdep_assert_held(&lock->wait_lock);
- debug_rt_mutex_unlock(lock);
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ return 0;
+ }
+
+ set_current_state(state);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
+ trace_contention_begin(lock, LCB_F_RT);
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
+ if (likely(!ret))
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ } else {
+ __set_current_state(TASK_RUNNING);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
}
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ fixup_rt_mutex_waiters(lock, true);
- return true; /* call postunlock() */
+ trace_contention_end(lock, ret);
+
+ return ret;
}
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
- bool postunlock;
+ struct rt_mutex_waiter waiter;
+ int ret;
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-}
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter);
-/**
- * rt_mutex_destroy - mark a mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
- */
-void rt_mutex_destroy(struct rt_mutex *lock)
-{
- WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- lock->magic = NULL;
-#endif
+ debug_rt_mutex_free_waiter(&waiter);
+ return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_destroy);
-/**
- * __rt_mutex_init - initialize the rt lock
- *
- * @lock: the rt lock to be initialized
- *
- * Initialize the rt lock to unlocked state.
- *
- * Initializing of a locked rt lock is not allowed
+/*
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-void __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- lock->owner = NULL;
- raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT_CACHED;
+ unsigned long flags;
+ int ret;
- if (name && key)
- debug_rt_mutex_init(lock, name, key);
+ /*
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
+ */
+ rt_mutex_pre_schedule();
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_post_schedule();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
- */
-void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
{
- __rt_mutex_init(lock, NULL, NULL);
- debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner);
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (likely(rt_mutex_try_acquire(lock)))
+ return 0;
+
+ return rt_mutex_slowlock(lock, NULL, state);
}
+#endif /* RT_MUTEX_BUILD_MUTEX */
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+/*
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
-}
/**
- * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: does _NOT_ remove the @waiter on failure; must either call
- * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
*/
-int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
{
- int ret;
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
lockdep_assert_held(&lock->wait_lock);
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
+ if (try_to_take_rt_mutex(lock, current, NULL))
+ return;
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ rt_mutex_init_rtlock_waiter(&waiter);
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
- }
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
- debug_rt_mutex_print_deadlock(waiter);
+ trace_contention_begin(lock, LCB_F_RT);
- return ret;
-}
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
- * on failure.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter))
+ break;
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
- return ret;
-}
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ schedule_rtlock();
-/**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
-{
- if (!rt_mutex_has_waiters(lock))
- return NULL;
-
- return rt_mutex_top_waiter(lock)->task;
-}
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
+ }
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
- */
-int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
+ fixup_rt_mutex_waiters(lock, true);
+ debug_rt_mutex_free_waiter(&waiter);
- return ret;
+ trace_contention_end(lock, 0);
}
-/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
- * rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
- */
-bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- bool cleanup = false;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
- }
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
+ unsigned long flags;
- return cleanup;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
}
+
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
deleted file mode 100644
index 732f96abf462..000000000000
--- a/kernel/locking/rtmutex.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c.
- * Non-debug version.
- */
-
-#define rt_mutex_deadlock_check(l) (0)
-#define debug_rt_mutex_init_waiter(w) do { } while (0)
-#define debug_rt_mutex_free_waiter(w) do { } while (0)
-#define debug_rt_mutex_lock(l) do { } while (0)
-#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
-#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
-#define debug_rt_mutex_unlock(l) do { } while (0)
-#define debug_rt_mutex_init(m, n, k) do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
-#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_reset_waiter(w) do { } while (0)
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- WARN(1, "rtmutex deadlock detected\n");
-}
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
- enum rtmutex_chainwalk walk)
-{
- return walk == RT_MUTEX_FULL_CHAINWALK;
-}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..a6974d044593
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ struct lockdep_map *nest_lock,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_clear_owner(lock);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, false);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+void __mutex_rt_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(__mutex_rt_init);
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched mutex_lock_killable_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index d1d62f942be2..1162e07cdaea 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,50 +13,127 @@
#ifndef __KERNEL_RTMUTEX_COMMON_H
#define __KERNEL_RTMUTEX_COMMON_H
+#include <linux/debug_locks.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
+ * @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
+ * @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
- struct rt_mutex *lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- unsigned long ip;
- struct pid *deadlock_task_pid;
- struct rt_mutex *deadlock_lock;
-#endif
- int prio;
- u64 deadline;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
+ struct ww_acquire_ctx *ww_ctx;
+};
+
+/**
+ * rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
};
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
/*
- * Various helpers to access the waiters-tree:
+ * PI-futex support (proxy locking functions, etc.):
*/
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
-#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
+/*
+ * Must be guarded because this header is included from rcu/tree_plugin.h
+ * unconditionally.
+ */
+#ifdef CONFIG_RT_MUTEXES
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
+}
+
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -67,45 +144,17 @@ static inline int task_has_pi_waiters(struct task_struct *p)
return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
-{
- return rb_entry(p->pi_waiters.rb_leftmost,
- struct rt_mutex_waiter, pi_tree_entry);
-}
-
-#else
-
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
-{
- return false;
-}
-
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
- return NULL;
-}
+ lockdep_assert_held(&p->pi_lock);
-static inline int task_has_pi_waiters(struct task_struct *p)
-{
- return false;
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+ pi_tree.entry);
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
-{
- return NULL;
-}
-
-#endif
-
-/*
- * lock->owner state tracking:
- */
#define RT_MUTEX_HAS_WAITERS 1UL
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
{
unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
@@ -127,40 +176,59 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
+{
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
+}
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+/* Debug functions */
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
+static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x11, sizeof(*waiter));
+}
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-# include "rtmutex-debug.h"
-#else
-# include "rtmutex.h"
-#endif
+static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..34a59569db6b
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ int ret;
+
+ rwbase_pre_schedule();
+ raw_spin_lock_irq(&rtm->wait_lock);
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ rwbase_pre_schedule();
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
+ trace_contention_end(rwb, -EINTR);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f11b9bd3431d..2340b6d90ec6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,23 +27,19 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>
+#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
/*
- * The least significant 3 bits of the owner value has the following
+ * The least significant 2 bits of the owner value has the following
* meanings when set.
* - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
- * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
- * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
+ * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
*
- * When the rwsem is either owned by an anonymous writer, or it is
- * reader-owned, but a spinning writer has timed out, both nonspinnable
- * bits will be set to disable optimistic spinning by readers and writers.
- * In the later case, the last unlocking reader should then check the
- * writer nonspinnable bit and clear it only to give writers preference
- * to acquire the lock via optimistic spinning, but not readers. Similar
- * action is also done in the reader slowpath.
+ * When the rwsem is reader-owned and a spinning writer has timed out,
+ * the nonspinnable bit will be set to disable optimistic spinning.
* When a writer acquires a rwsem, it puts its task_struct pointer
* into the owner field. It is cleared after an unlock.
@@ -59,46 +55,13 @@
* is involved. Ideally we would like to track all the readers that own
* a rwsem, but the overhead is simply too big.
*
- * Reader optimistic spinning is helpful when the reader critical section
- * is short and there aren't that many readers around. It makes readers
- * relatively more preferred than writers. When a writer times out spinning
- * on a reader-owned lock and set the nospinnable bits, there are two main
- * reasons for that.
- *
- * 1) The reader critical section is long, perhaps the task sleeps after
- * acquiring the read lock.
- * 2) There are just too many readers contending the lock causing it to
- * take a while to service all of them.
- *
- * In the former case, long reader critical section will impede the progress
- * of writers which is usually more important for system performance. In
- * the later case, reader optimistic spinning tends to make the reader
- * groups that contain readers that acquire the lock together smaller
- * leading to more of them. That may hurt performance in some cases. In
- * other words, the setting of nonspinnable bits indicates that reader
- * optimistic spinning may not be helpful for those workloads that cause
- * it.
- *
- * Therefore, any writers that had observed the setting of the writer
- * nonspinnable bit for a given rwsem after they fail to acquire the lock
- * via optimistic spinning will set the reader nonspinnable bit once they
- * acquire the write lock. Similarly, readers that observe the setting
- * of reader nonspinnable bit at slowpath entry will set the reader
- * nonspinnable bits when they acquire the read lock via the wakeup path.
- *
- * Once the reader nonspinnable bit is on, it will only be reset when
- * a writer is able to acquire the rwsem in the fast path or somehow a
- * reader or writer in the slowpath doesn't observe the nonspinable bit.
- *
- * This is to discourage reader optmistic spinning on that particular
- * rwsem and make writers more preferred. This adaptive disabling of reader
- * optimistic spinning will alleviate the negative side effect of this
- * feature.
+ * A fast path reader optimistic lock stealing is supported when the rwsem
+ * is previously owned by a writer and the following conditions are met:
+ * - rwsem is not currently writer owned
+ * - the handoff isn't set.
*/
#define RWSEM_READER_OWNED (1UL << 0)
-#define RWSEM_RD_NONSPINNABLE (1UL << 1)
-#define RWSEM_WR_NONSPINNABLE (1UL << 2)
-#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
+#define RWSEM_NONSPINNABLE (1UL << 1)
#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
#ifdef CONFIG_DEBUG_RWSEMS
@@ -143,9 +106,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -170,14 +133,19 @@
* the owner value concurrently without lock. Read from owner, however,
* may not need READ_ONCE() as long as the pointer value is only used
* for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
*/
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, (long)current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, 0);
}
@@ -203,7 +171,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
struct task_struct *owner)
{
unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
- (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
+ (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
atomic_long_set(&sem->owner, val);
}
@@ -270,12 +238,31 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
owner | RWSEM_NONSPINNABLE));
}
-static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
{
- long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
- if (WARN_ON_ONCE(cnt < 0))
+ *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+
+ if (WARN_ON_ONCE(*cntp < 0))
rwsem_set_nonspinnable(sem);
- return !(cnt & RWSEM_READ_FAILED_MASK);
+
+ if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
+ rwsem_set_reader_owned(sem);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
}
/*
@@ -353,7 +340,7 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
- unsigned long last_rowner;
+ bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -364,12 +351,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
-enum writer_wait_state {
- WRITER_NOT_FIRST, /* Writer is not first in wait list */
- WRITER_FIRST, /* Writer is first in wait list */
- WRITER_HANDOFF /* Writer is first & handoff needed */
-};
-
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -385,6 +366,34 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
+ */
+static inline bool
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return true;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -396,6 +405,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
@@ -451,10 +462,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -467,10 +480,6 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* the reader is copied over.
*/
owner = waiter->task;
- if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
- owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
- lockevent_inc(rwsem_opt_norspin);
- }
__rwsem_set_reader_owned(sem, owner);
}
@@ -508,24 +517,31 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Limit # of readers that can be woken up per wakeup call.
*/
- if (woken >= MAX_READERS_WAKEUP)
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
}
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
}
- /*
- * When we've woken a reader, we no longer need to force writers
- * to give up the lock and we can clear HANDOFF.
- */
- if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
- adjustment -= RWSEM_FLAG_HANDOFF;
-
if (adjustment)
atomic_long_add(adjustment, &sem->count);
@@ -552,16 +568,43 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
- enum writer_wait_state wstate)
+ struct rwsem_waiter *waiter)
{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -570,13 +613,26 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
- if (has_handoff && wstate == WRITER_NOT_FIRST)
- return false;
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return false;
+ }
new = count;
if (count & RWSEM_LOCK_MASK) {
- if (has_handoff || (wstate != WRITER_HANDOFF))
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
@@ -590,41 +646,44 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
- if (new & RWSEM_FLAG_HANDOFF)
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
return false;
+ }
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
- * Try to acquire read lock before the reader is put on wait queue.
- * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
- * is ongoing.
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
*/
-static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
-{
- long count = atomic_long_read(&sem->count);
-
- if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
- return false;
-
- count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
- if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
- rwsem_set_reader_owned(sem);
- lockevent_inc(rwsem_opt_rlock);
- return true;
- }
-
- /* Back out the change */
- atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
- return false;
-}
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
*/
@@ -636,24 +695,14 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
count | RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- lockevent_inc(rwsem_opt_wlock);
+ lockevent_inc(rwsem_opt_lock);
return true;
}
}
return false;
}
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
- unsigned long nonspinnable)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
unsigned long flags;
@@ -664,45 +713,28 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
return false;
}
- preempt_disable();
- rcu_read_lock();
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
owner = rwsem_owner_flags(sem, &flags);
/*
* Don't check the read-owner as the entry may be stale.
*/
- if ((flags & nonspinnable) ||
+ if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- rcu_read_unlock();
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
}
-/*
- * The rwsem_spin_on_owner() function returns the folowing 4 values
- * depending on the lock owner state.
- * OWNER_NULL : owner is currently NULL
- * OWNER_WRITER: when owner changes and is a writer
- * OWNER_READER: when owner changes and the new owner may be a reader.
- * OWNER_NONSPINNABLE:
- * when optimistic spinning has to stop because either the
- * owner stops running, is unknown, or its timeslice has
- * been used up.
- */
-enum owner_state {
- OWNER_NULL = 1 << 0,
- OWNER_WRITER = 1 << 1,
- OWNER_READER = 1 << 2,
- OWNER_NONSPINNABLE = 1 << 3,
-};
#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
-rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
+rwsem_owner_state(struct task_struct *owner, unsigned long flags)
{
- if (flags & nonspinnable)
+ if (flags & RWSEM_NONSPINNABLE)
return OWNER_NONSPINNABLE;
if (flags & RWSEM_READER_OWNED)
@@ -712,18 +744,19 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long
}
static noinline enum owner_state
-rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *new, *owner;
unsigned long flags, new_flags;
enum owner_state state;
+ lockdep_assert_preemption_disabled();
+
owner = rwsem_owner_flags(sem, &flags);
- state = rwsem_owner_state(owner, flags, nonspinnable);
+ state = rwsem_owner_state(owner, flags);
if (state != OWNER_WRITER)
return state;
- rcu_read_lock();
for (;;) {
/*
* When a waiting writer set the handoff flag, it may spin
@@ -733,7 +766,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
*/
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
- state = rwsem_owner_state(new, new_flags, nonspinnable);
+ state = rwsem_owner_state(new, new_flags);
break;
}
@@ -741,7 +774,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
* owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
*/
barrier();
@@ -752,7 +787,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
cpu_relax();
}
- rcu_read_unlock();
return state;
}
@@ -782,16 +816,12 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
return sched_clock() + delta;
}
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
bool taken = false;
int prev_owner_state = OWNER_NULL;
int loop = 0;
u64 rspin_threshold = 0;
- unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
- : RWSEM_RD_NONSPINNABLE;
-
- preempt_disable();
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
@@ -806,15 +836,14 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
for (;;) {
enum owner_state owner_state;
- owner_state = rwsem_spin_on_owner(sem, nonspinnable);
+ owner_state = rwsem_spin_on_owner(sem);
if (!(owner_state & OWNER_SPINNABLE))
break;
/*
* Try to acquire the lock
*/
- taken = wlock ? rwsem_try_write_lock_unqueued(sem)
- : rwsem_try_read_lock_unqueued(sem);
+ taken = rwsem_try_write_lock_unqueued(sem);
if (taken)
break;
@@ -822,7 +851,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
/*
* Time-based reader-owned rwsem optimistic spinning
*/
- if (wlock && (owner_state == OWNER_READER)) {
+ if (owner_state == OWNER_READER) {
/*
* Re-initialize rspin_threshold every time when
* the owner state changes from non-reader to reader.
@@ -831,7 +860,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
* the beginning of the 2nd reader phase.
*/
if (prev_owner_state != OWNER_READER) {
- if (rwsem_test_oflags(sem, nonspinnable))
+ if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
break;
rspin_threshold = rwsem_rspin_threshold(sem);
loop = 0;
@@ -871,7 +900,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
* we try to get it. The new owner may be a spinnable
* writer.
*
- * To take advantage of two scenarios listed agove, the RT
+ * To take advantage of two scenarios listed above, the RT
* task is made to retry one more time to see if it can
* acquire the lock or continue spinning on the new owning
* writer. Of course, if the time lag is long enough or the
@@ -901,124 +930,97 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
/*
- * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
+ * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
* only be called when the reader count reaches 0.
- *
- * This give writers better chance to acquire the rwsem first before
- * readers when the rwsem was being held by readers for a relatively long
- * period of time. Race can happen that an optimistic spinner may have
- * just stolen the rwsem and set the owner, but just clearing the
- * RWSEM_WR_NONSPINNABLE bit will do no harm anyway.
*/
-static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
+static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
- atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
+ atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
-/*
- * This function is called when the reader fails to acquire the lock via
- * optimistic spinning. In this case we will still attempt to do a trylock
- * when comparing the rwsem state right now with the state when entering
- * the slowpath indicates that the reader is still in a valid reader phase.
- * This happens when the following conditions are true:
- *
- * 1) The lock is currently reader owned, and
- * 2) The lock is previously not reader-owned or the last read owner changes.
- *
- * In the former case, we have transitioned from a writer phase to a
- * reader-phase while spinning. In the latter case, it means the reader
- * phase hasn't ended when we entered the optimistic spinning loop. In
- * both cases, the reader is eligible to acquire the lock. This is the
- * secondary path where a read lock is acquired optimistically.
- *
- * The reader non-spinnable bit wasn't set at time of entry or it will
- * not be here at all.
- */
-static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
- unsigned long last_rowner)
-{
- unsigned long owner = atomic_long_read(&sem->owner);
-
- if (!(owner & RWSEM_READER_OWNED))
- return false;
-
- if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
- rwsem_try_read_lock_unqueued(sem)) {
- lockevent_inc(rwsem_opt_rlock2);
- lockevent_add(rwsem_opt_fail, -1);
- return true;
- }
- return false;
-}
#else
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
- unsigned long nonspinnable)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
return false;
}
-static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
-static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
+static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
-static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
- unsigned long last_rowner)
+static inline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- return false;
+ return OWNER_NONSPINNABLE;
}
+#endif
+
+/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
-static inline int
-rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
{
- return 0;
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
}
-#define OWNER_NULL 1
-#endif
/*
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
-rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
{
- long count, adjustment = -RWSEM_READER_BIAS;
+ long adjustment = -RWSEM_READER_BIAS;
+ long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- bool wake = false;
/*
- * Save the current read-owner of rwsem, if available, and the
- * reader nonspinnable bit.
+ * To prevent a constant stream of readers from starving a sleeping
+ * waiter, don't attempt optimistic lock stealing if the lock is
+ * currently owned by readers.
*/
- waiter.last_rowner = atomic_long_read(&sem->owner);
- if (!(waiter.last_rowner & RWSEM_READER_OWNED))
- waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
-
- if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
+ if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
+ (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
goto queue;
/*
- * Undo read bias from down_read() and do optimistic spinning.
+ * Reader optimistic lock stealing.
*/
- atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
- adjustment = 0;
- if (rwsem_optimistic_spin(sem, false)) {
- /* rwsem_optimistic_spin() implies ACQUIRE on success */
+ if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_steal);
+
/*
- * Wake up other readers in the wait list if the front
- * waiter is a reader.
+ * Wake up other readers in the wait queue if it is
+ * the first reader.
*/
- if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
+ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
if (!list_empty(&sem->wait_list))
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
@@ -1027,26 +1029,23 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
wake_up_q(&wake_q);
}
return sem;
- } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
- /* rwsem_reader_phase_trylock() implies ACQUIRE on success */
- return sem;
}
queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
/*
* In case the wait queue is empty and the lock isn't owned
- * by a writer or has the handoff bit set, this reader can
- * exit the slowpath and return immediately as its
- * RWSEM_READER_BIAS has already been set in the count.
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
*/
- if (adjustment && !(atomic_long_read(&sem->count) &
- (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
@@ -1056,30 +1055,18 @@ queue:
}
adjustment += RWSEM_FLAG_WAITERS;
}
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
- if (adjustment)
- count = atomic_long_add_return(adjustment, &sem->count);
- else
- count = atomic_long_read(&sem->count);
-
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (!(count & RWSEM_LOCK_MASK)) {
- clear_wr_nonspinnable(sem);
- wake = true;
- }
- if (wake || (!(count & RWSEM_WRITER_MASK) &&
- (adjustment & RWSEM_FLAG_WAITERS)))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ count = atomic_long_add_return(adjustment, &sem->count);
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
/* wait to be given the lock */
for (;;) {
@@ -1096,100 +1083,54 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list)) {
- atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
- &sem->count);
- }
- raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
/*
- * This function is called by the a write lock owner. So the owner value
- * won't get changed by others.
- */
-static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
- bool disable)
-{
- if (unlikely(disable)) {
- atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
- lockevent_inc(rwsem_opt_norspin);
- }
-}
-
-/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- long count;
- bool disable_rspin;
- enum writer_wait_state wstate;
struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
- if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
- rwsem_optimistic_spin(sem, true)) {
+ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
/* rwsem_optimistic_spin() implies ACQUIRE on success */
return sem;
}
/*
- * Disable reader optimistic spinning for this rwsem after
- * acquiring the write lock when the setting of the nonspinnable
- * bits are observed.
- */
- disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
-
- /*
* Optimistic spinning failed, proceed to the slowpath
* and block until we can acquire the sem.
*/
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
- if (wstate == WRITER_NOT_FIRST) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and:
- * 1) there are no no active locks, wake the front
- * queued process(es) as the handoff bit might be set.
- * 2) there are no active writers and some readers, the lock
- * must be read owned; so we try to wake any read lock
- * waiters that were queued ahead of us.
- */
- if (count & RWSEM_WRITER_MASK)
- goto wait;
-
- rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
- ? RWSEM_WAKE_READERS
- : RWSEM_WAKE_ANY, &wake_q);
-
+ if (rwsem_first_waiter(sem) != &waiter) {
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
if (!wake_q_empty(&wake_q)) {
/*
* We want to minimize wait_lock hold time especially
@@ -1197,24 +1138,27 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
*/
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
- wake_q_init(&wake_q); /* Used again, reinit */
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
-wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
- if (rwsem_try_write_lock(sem, wstate)) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
@@ -1223,71 +1167,32 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if (wstate == WRITER_HANDOFF &&
- rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
- goto trylock_again;
-
- /* Block until there are no active lockers. */
- for (;;) {
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- schedule();
- lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
- /*
- * If HANDOFF bit is set, unconditionally do
- * a trylock.
- */
- if (wstate == WRITER_HANDOFF)
- break;
-
- if ((wstate == WRITER_NOT_FIRST) &&
- (rwsem_first_waiter(sem) == &waiter))
- wstate = WRITER_FIRST;
-
- count = atomic_long_read(&sem->count);
- if (!(count & RWSEM_LOCK_MASK))
- break;
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
- /*
- * The setting of the handoff bit is deferred
- * until rwsem_try_write_lock() is called.
- */
- if ((wstate == WRITER_FIRST) && (rt_task(current) ||
- time_after(jiffies, waiter.timeout))) {
- wstate = WRITER_HANDOFF;
- lockevent_inc(rwsem_wlock_handoff);
- break;
- }
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
}
+
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
- rwsem_disable_reader_optspin(sem, disable_rspin);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
-
- return ret;
+ trace_contention_end(sem, 0);
+ return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
-
- if (unlikely(wstate == WRITER_HANDOFF))
- atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
- if (list_empty(&sem->wait_list))
- atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
- else
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
-
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1295,7 +1200,7 @@ out_nolock:
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
*/
-static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
DEFINE_WAKE_Q(wake_q);
@@ -1335,89 +1240,96 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline void __down_read(struct rw_semaphore *sem)
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
{
- if (!rwsem_read_trylock(sem)) {
- rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ int ret = 0;
+ long count;
+
+ preempt_disable();
+ if (!rwsem_read_trylock(sem, &count)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
- } else {
- rwsem_set_reader_owned(sem);
}
+out:
+ preempt_enable();
+ return ret;
}
-static inline int __down_read_killable(struct rw_semaphore *sem)
+static __always_inline void __down_read(struct rw_semaphore *sem)
{
- if (!rwsem_read_trylock(sem)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
- return -EINTR;
- DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
- } else {
- rwsem_set_reader_owned(sem);
- }
- return 0;
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_INTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- /*
- * Optimize for the case when the rwsem is not locked at all.
- */
- tmp = RWSEM_UNLOCKED_VALUE;
- do {
+ preempt_disable();
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- tmp + RWSEM_READER_BIAS)) {
+ tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
- } while (!(tmp & RWSEM_READ_FAILED_MASK));
- return 0;
+ }
+ preempt_enable();
+ return ret;
}
/*
* lock for writing
*/
-static inline void __down_write(struct rw_semaphore *sem)
+static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ int ret = 0;
- if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED)))
- rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
- else
- rwsem_set_owner(sem);
+ preempt_disable();
+ if (unlikely(!rwsem_write_trylock(sem))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
+ ret = -EINTR;
+ }
+ preempt_enable();
+ return ret;
}
-static inline int __down_write_killable(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
- if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED))) {
- if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
- return -EINTR;
- } else {
- rwsem_set_owner(sem);
- }
- return 0;
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
}
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- long tmp;
+ int ret;
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
- tmp = RWSEM_UNLOCKED_VALUE;
- if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED)) {
- rwsem_set_owner(sem);
- return true;
- }
- return false;
+ return ret;
}
/*
@@ -1430,14 +1342,16 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
- clear_wr_nonspinnable(sem);
- rwsem_wake(sem, tmp);
+ clear_nonspinnable(sem);
+ rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1455,10 +1369,12 @@ static inline void __up_write(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1476,13 +1392,131 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
+#define rwbase_schedule() \
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
+}
+
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* lock for reading
*/
@@ -1495,6 +1529,20 @@ void __sched down_read(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(down_read);
+int __sched down_read_interruptible(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_interruptible);
+
int __sched down_read_killable(struct rw_semaphore *sem)
{
might_sleep();
@@ -1605,6 +1653,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
}
EXPORT_SYMBOL(down_read_nested);
+int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable_nested);
+
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
@@ -1617,6 +1679,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index d9dd94defc0a..34bfae72f295 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -50,10 +51,11 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -72,11 +74,12 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -98,11 +101,12 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -119,7 +123,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
@@ -127,7 +131,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -152,11 +156,12 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -175,7 +180,7 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
@@ -201,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -232,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ff08380f531..8475a0794f8c 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -58,10 +58,10 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
/*
* We build the __lock_function inlines here. They are too large for
* inlining all over the place, but here is only one user per function
- * which embedds them into the calling _lock_function below.
+ * which embeds them into the calling _lock_function below.
*
* This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
+ * time (making _this_ CPU preemptible if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*/
#define BUILD_LOCK_OPS(op, locktype) \
@@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
@@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index b9d93087ee66..87b03d2e41db 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key, short inner)
@@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +50,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -139,6 +142,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -228,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..38e292454fcc
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ rtlock_might_resched();
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+{
+ rtlock_slowlock_locked(rtm);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_pre_schedule()
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#define rwbase_post_schedule()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
+void __sched rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 3e82f449b4ff..78719e1ef1b1 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,13 +9,22 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
@@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work)
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
- while (!ww_mutex_trylock(&mtx->mutex))
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
@@ -109,19 +118,39 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(void)
+static int test_aa(bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
+ const char *from = trylock ? "trylock" : "lock";
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&mutex, &ctx);
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
- if (ww_mutex_trylock(&mutex)) {
- pr_err("%s: trylocked itself!\n", __func__);
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
@@ -129,17 +158,17 @@ static int test_aa(void)
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
- pr_err("%s: missed deadlock for recursing, ret=%d\n",
- __func__, ret);
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
+ ww_mutex_unlock(&mutex);
ret = 0;
out:
- ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
@@ -150,7 +179,7 @@ struct test_abba {
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
- bool resolve;
+ bool resolve, trylock;
int result;
};
@@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba->b_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
@@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool resolve)
+static int test_abba(bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
@@ -192,12 +226,18 @@ static int test_abba(bool resolve)
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.trylock = trylock;
abba.resolve = resolve;
schedule_work(&abba.work);
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba.a_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
@@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err, erra = 0;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -346,6 +386,19 @@ struct stress {
int nlocks;
};
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
static int *get_random_order(int count)
{
int *order;
@@ -359,7 +412,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = prandom_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -412,21 +465,21 @@ retry:
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
- ww_mutex_lock_slow(&locks[order[contended]], &ctx);
- goto retry;
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
}
+ ww_acquire_fini(&ctx);
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
-
- ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -491,14 +544,13 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
@@ -512,8 +564,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -524,15 +574,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -556,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -573,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
@@ -581,7 +639,11 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
- int ret;
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
+
+ prandom_seed_state(&rng, get_random_u64());
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
@@ -591,17 +653,19 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = test_aa();
+ ret = test_aa(false);
if (ret)
return ret;
- ret = test_abba(false);
+ ret = test_aa(true);
if (ret)
return ret;
- ret = test_abba(true);
- if (ret)
- return ret;
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
ret = test_cycle(ncpus);
if (ret)
@@ -615,10 +679,11 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
+ printk(KERN_INFO "All ww mutex selftests passed\n");
return 0;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..3ad2cc4823e5
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock)
+{
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->rtmutex.wait_lock);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_unlock(&lock->rtmutex.wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_prio(a_prio) || rt_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base);
+ __ww_mutex_check_waiters(&lock->base, ctx);
+ unlock_wait_lock(&lock->base);
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..c7196de838ed
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 33783abc377b..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/elf.h>
-#include <asm/module.h>
-
-struct load_info {
- const char *name;
- /* pointer to module in temporary copy, freed at end of load_module() */
- struct module *mod;
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
-#ifdef CONFIG_KALLSYMS
- unsigned long mod_kallsyms_init_off;
-#endif
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
-extern int mod_verify_sig(const void *mod, struct load_info *info);
diff --git a/kernel/module.c b/kernel/module.c
deleted file mode 100644
index aa183c9ac0a2..000000000000
--- a/kernel/module.c
+++ /dev/null
@@ -1,4548 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- Copyright (C) 2002 Richard Henderson
- Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
-
-*/
-
-#define INCLUDE_VERMAGIC
-
-#include <linux/export.h>
-#include <linux/extable.h>
-#include <linux/moduleloader.h>
-#include <linux/module_signature.h>
-#include <linux/trace_events.h>
-#include <linux/init.h>
-#include <linux/kallsyms.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/sysfs.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/fcntl.h>
-#include <linux/rcupdate.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/moduleparam.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/vermagic.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <linux/set_memory.h>
-#include <asm/mmu_context.h>
-#include <linux/license.h>
-#include <asm/sections.h>
-#include <linux/tracepoint.h>
-#include <linux/ftrace.h>
-#include <linux/livepatch.h>
-#include <linux/async.h>
-#include <linux/percpu.h>
-#include <linux/kmemleak.h>
-#include <linux/jump_label.h>
-#include <linux/pfn.h>
-#include <linux/bsearch.h>
-#include <linux/dynamic_debug.h>
-#include <linux/audit.h>
-#include <uapi/linux/module.h>
-#include "module-internal.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/module.h>
-
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
-/*
- * Mutex protects:
- * 1) List of modules (also safely readable with preempt_disable),
- * 2) module_use links,
- * 3) module_addr_min/module_addr_max.
- * (delete and add uses RCU list operations). */
-DEFINE_MUTEX(module_mutex);
-EXPORT_SYMBOL_GPL(module_mutex);
-static LIST_HEAD(modules);
-
-/* Work queue for freeing init sections in success case */
-static struct work_struct init_free_wq;
-static struct llist_head init_free_list;
-
-#ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
- return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
- unsigned long val = (unsigned long)key;
- unsigned long start, end;
-
- start = __mod_tree_val(n);
- if (val < start)
- return -1;
-
- end = start + __mod_tree_size(n);
- if (val >= end)
- return 1;
-
- return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
- .less = mod_tree_less,
- .comp = mod_tree_comp,
-};
-
-static struct mod_tree_root {
- struct latch_tree_root root;
- unsigned long addr_min;
- unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
- .addr_min = -1UL,
-};
-
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
- latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
- latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
- mod->core_layout.mtn.mod = mod;
- mod->init_layout.mtn.mod = mod;
-
- __mod_tree_insert(&mod->core_layout.mtn);
- if (mod->init_layout.size)
- __mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
- if (mod->init_layout.size)
- __mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
- __mod_tree_remove(&mod->core_layout.mtn);
- mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
- struct latch_tree_node *ltn;
-
- ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
- if (!ltn)
- return NULL;
-
- return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
- struct module *mod;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (within_module(addr, mod))
- return mod;
- }
-
- return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
-
-/*
- * Bounds of module text, for speeding up __module_address.
- * Protected by module_mutex.
- */
-static void __mod_update_bounds(void *base, unsigned int size)
-{
- unsigned long min = (unsigned long)base;
- unsigned long max = min + size;
-
- if (min < module_addr_min)
- module_addr_min = min;
- if (max > module_addr_max)
- module_addr_max = max;
-}
-
-static void mod_update_bounds(struct module *mod)
-{
- __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
- if (mod->init_layout.size)
- __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
-}
-
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-static void module_assert_mutex(void)
-{
- lockdep_assert_held(&module_mutex);
-}
-
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
- if (unlikely(!debug_locks))
- return;
-
- WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
- !lockdep_is_held(&module_mutex));
-#endif
-}
-
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
- return sig_enforce;
-}
-EXPORT_SYMBOL(is_module_sig_enforced);
-
-void set_module_sig_enforced(void)
-{
- sig_enforce = true;
-}
-
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-core_param(nomodule, modules_disabled, bint, 0);
-
-/* Waiting for a module to finish initializing? */
-static DECLARE_WAIT_QUEUE_HEAD(module_wq);
-
-static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-
-int register_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(register_module_notifier);
-
-int unregister_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(unregister_module_notifier);
-
-/*
- * We require a truly strong try_module_get(): 0 means success.
- * Otherwise an error is returned due to ongoing or failed
- * initialization etc.
- */
-static inline int strong_try_module_get(struct module *mod)
-{
- BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
- if (mod && mod->state == MODULE_STATE_COMING)
- return -EBUSY;
- if (try_module_get(mod))
- return 0;
- else
- return -ENOENT;
-}
-
-static inline void add_taint_module(struct module *mod, unsigned flag,
- enum lockdep_ok lockdep_ok)
-{
- add_taint(flag, lockdep_ok);
- set_bit(flag, &mod->taints);
-}
-
-/*
- * A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
- */
-void __noreturn __module_put_and_exit(struct module *mod, long code)
-{
- module_put(mod);
- do_exit(code);
-}
-EXPORT_SYMBOL(__module_put_and_exit);
-
-/* Find a module section: 0 means not found. */
-static unsigned int find_sec(const struct load_info *info, const char *name)
-{
- unsigned int i;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- /* Alloc bit cleared means "ignore it." */
- if ((shdr->sh_flags & SHF_ALLOC)
- && strcmp(info->secstrings + shdr->sh_name, name) == 0)
- return i;
- }
- return 0;
-}
-
-/* Find a module section, or NULL. */
-static void *section_addr(const struct load_info *info, const char *name)
-{
- /* Section 0 has sh_addr 0. */
- return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
-}
-
-/* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(const struct load_info *info,
- const char *name,
- size_t object_size,
- unsigned int *num)
-{
- unsigned int sec = find_sec(info, name);
-
- /* Section 0 has sh_addr 0 and sh_size 0. */
- *num = info->sechdrs[sec].sh_size / object_size;
- return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-extern const s32 __start___kcrctab_gpl_future[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const s32 __start___kcrctab_unused[];
-extern const s32 __start___kcrctab_unused_gpl[];
-#endif
-
-#ifndef CONFIG_MODVERSIONS
-#define symversion(base, idx) NULL
-#else
-#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
-#endif
-
-static bool each_symbol_in_section(const struct symsearch *arr,
- unsigned int arrsize,
- struct module *owner,
- bool (*fn)(const struct symsearch *syms,
- struct module *owner,
- void *data),
- void *data)
-{
- unsigned int j;
-
- for (j = 0; j < arrsize; j++) {
- if (fn(&arr[j], owner, data))
- return true;
- }
-
- return false;
-}
-
-/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
- struct module *owner,
- void *data),
- void *data)
-{
- struct module *mod;
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY, false },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY, false },
- { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
- __start___kcrctab_gpl_future,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { __start___ksymtab_unused, __stop___ksymtab_unused,
- __start___kcrctab_unused,
- NOT_GPL_ONLY, true },
- { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
- __start___kcrctab_unused_gpl,
- GPL_ONLY, true },
-#endif
- };
-
- module_assert_mutex_or_preempt();
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
- return true;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY, false },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY, false },
- { mod->gpl_future_syms,
- mod->gpl_future_syms + mod->num_gpl_future_syms,
- mod->gpl_future_crcs,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms,
- mod->unused_syms + mod->num_unused_syms,
- mod->unused_crcs,
- NOT_GPL_ONLY, true },
- { mod->unused_gpl_syms,
- mod->unused_gpl_syms + mod->num_unused_gpl_syms,
- mod->unused_gpl_crcs,
- GPL_ONLY, true },
-#endif
- };
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
- return true;
- }
- return false;
-}
-EXPORT_SYMBOL_GPL(each_symbol_section);
-
-struct find_symbol_arg {
- /* Input */
- const char *name;
- bool gplok;
- bool warn;
-
- /* Output */
- struct module *owner;
- const s32 *crc;
- const struct kernel_symbol *sym;
-};
-
-static bool check_exported_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
-{
- struct find_symbol_arg *fsa = data;
-
- if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
- return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
- pr_warn("Symbol %s is being used by a non-GPL module, "
- "which will not be allowed in the future\n",
- fsa->name);
- }
- }
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- if (syms->unused && fsa->warn) {
- pr_warn("Symbol %s is marked as UNUSED, however this module is "
- "using it.\n", fsa->name);
- pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evaluate if this is the right api to use and "
- "if it really is, submit a report to the linux kernel "
- "mailing list together with submitting your code for "
- "inclusion.\n");
- }
-#endif
-
- fsa->owner = owner;
- fsa->crc = symversion(syms->crcs, symnum);
- fsa->sym = &syms->start[symnum];
- return true;
-}
-
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
- return sym->value;
-#endif
-}
-
-static const char *kernel_symbol_name(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return offset_to_ptr(&sym->name_offset);
-#else
- return sym->name;
-#endif
-}
-
-static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- if (!sym->namespace_offset)
- return NULL;
- return offset_to_ptr(&sym->namespace_offset);
-#else
- return sym->namespace;
-#endif
-}
-
-static int cmp_name(const void *name, const void *sym)
-{
- return strcmp(name, kernel_symbol_name(sym));
-}
-
-static bool find_exported_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- void *data)
-{
- struct find_symbol_arg *fsa = data;
- struct kernel_symbol *sym;
-
- sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
- sizeof(struct kernel_symbol), cmp_name);
-
- if (sym != NULL && check_exported_symbol(syms, owner,
- sym - syms->start, data))
- return true;
-
- return false;
-}
-
-/* Find an exported symbol and return it, along with, (optional) crc and
- * (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
- struct module **owner,
- const s32 **crc,
- bool gplok,
- bool warn)
-{
- struct find_symbol_arg fsa;
-
- fsa.name = name;
- fsa.gplok = gplok;
- fsa.warn = warn;
-
- if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
- if (owner)
- *owner = fsa.owner;
- if (crc)
- *crc = fsa.crc;
- return fsa.sym;
- }
-
- pr_debug("Failed to find symbol %s\n", name);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(find_symbol);
-
-/*
- * Search for module by name: must hold module_mutex (or preempt disabled
- * for read-only access).
- */
-static struct module *find_module_all(const char *name, size_t len,
- bool even_unformed)
-{
- struct module *mod;
-
- module_assert_mutex_or_preempt();
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
- return mod;
- }
- return NULL;
-}
-
-struct module *find_module(const char *name)
-{
- module_assert_mutex();
- return find_module_all(name, strlen(name), false);
-}
-EXPORT_SYMBOL_GPL(find_module);
-
-#ifdef CONFIG_SMP
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return mod->percpu;
-}
-
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
- unsigned long align = pcpusec->sh_addralign;
-
- if (!pcpusec->sh_size)
- return 0;
-
- if (align > PAGE_SIZE) {
- pr_warn("%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
- align = PAGE_SIZE;
- }
-
- mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
- if (!mod->percpu) {
- pr_warn("%s: Could not allocate %lu bytes percpu data\n",
- mod->name, (unsigned long)pcpusec->sh_size);
- return -ENOMEM;
- }
- mod->percpu_size = pcpusec->sh_size;
- return 0;
-}
-
-static void percpu_modfree(struct module *mod)
-{
- free_percpu(mod->percpu);
-}
-
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return find_sec(info, ".data..percpu");
-}
-
-static void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
- struct module *mod;
- unsigned int cpu;
-
- preempt_disable();
-
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (!mod->percpu_size)
- continue;
- for_each_possible_cpu(cpu) {
- void *start = per_cpu_ptr(mod->percpu, cpu);
- void *va = (void *)addr;
-
- if (va >= start && va < start + mod->percpu_size) {
- if (can_addr) {
- *can_addr = (unsigned long) (va - start);
- *can_addr += (unsigned long)
- per_cpu_ptr(mod->percpu,
- get_boot_cpu_id());
- }
- preempt_enable();
- return true;
- }
- }
- }
-
- preempt_enable();
- return false;
-}
-
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
-{
- return __is_module_percpu_address(addr, NULL);
-}
-
-#else /* ... !CONFIG_SMP */
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return NULL;
-}
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- /* UP modules shouldn't have this section: ENOMEM isn't quite right */
- if (info->sechdrs[info->index.pcpu].sh_size != 0)
- return -ENOMEM;
- return 0;
-}
-static inline void percpu_modfree(struct module *mod)
-{
-}
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return 0;
-}
-static inline void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- /* pcpusec should be 0, and size of that section should be 0. */
- BUG_ON(size != 0);
-}
-bool is_module_percpu_address(unsigned long addr)
-{
- return false;
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
-#define MODINFO_ATTR(field) \
-static void setup_modinfo_##field(struct module *mod, const char *s) \
-{ \
- mod->field = kstrdup(s, GFP_KERNEL); \
-} \
-static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
- struct module_kobject *mk, char *buffer) \
-{ \
- return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
-} \
-static int modinfo_##field##_exists(struct module *mod) \
-{ \
- return mod->field != NULL; \
-} \
-static void free_modinfo_##field(struct module *mod) \
-{ \
- kfree(mod->field); \
- mod->field = NULL; \
-} \
-static struct module_attribute modinfo_##field = { \
- .attr = { .name = __stringify(field), .mode = 0444 }, \
- .show = show_modinfo_##field, \
- .setup = setup_modinfo_##field, \
- .test = modinfo_##field##_exists, \
- .free = free_modinfo_##field, \
-};
-
-MODINFO_ATTR(version);
-MODINFO_ATTR(srcversion);
-
-static char last_unloaded_module[MODULE_NAME_LEN+1];
-
-#ifdef CONFIG_MODULE_UNLOAD
-
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
-/* MODULE_REF_BASE is the base reference count by kmodule loader. */
-#define MODULE_REF_BASE 1
-
-/* Init the unload section of the module. */
-static int module_unload_init(struct module *mod)
-{
- /*
- * Initialize reference counter to MODULE_REF_BASE.
- * refcnt == 0 means module is going.
- */
- atomic_set(&mod->refcnt, MODULE_REF_BASE);
-
- INIT_LIST_HEAD(&mod->source_list);
- INIT_LIST_HEAD(&mod->target_list);
-
- /* Hold reference count during initialization. */
- atomic_inc(&mod->refcnt);
-
- return 0;
-}
-
-/* Does a already use b? */
-static int already_uses(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- list_for_each_entry(use, &b->source_list, source_list) {
- if (use->source == a) {
- pr_debug("%s uses %s!\n", a->name, b->name);
- return 1;
- }
- }
- pr_debug("%s does not use %s!\n", a->name, b->name);
- return 0;
-}
-
-/*
- * Module a uses b
- * - we add 'a' as a "source", 'b' as a "target" of module use
- * - the module_use is added to the list of 'b' sources (so
- * 'b' can walk the list to see who sourced them), and of 'a'
- * targets (so 'a' can see what modules it targets).
- */
-static int add_module_usage(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- pr_debug("Allocating new usage for %s.\n", a->name);
- use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use)
- return -ENOMEM;
-
- use->source = a;
- use->target = b;
- list_add(&use->source_list, &b->source_list);
- list_add(&use->target_list, &a->target_list);
- return 0;
-}
-
-/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
-{
- int err;
-
- if (b == NULL || already_uses(a, b))
- return 0;
-
- /* If module isn't available, we fail. */
- err = strong_try_module_get(b);
- if (err)
- return err;
-
- err = add_module_usage(a, b);
- if (err) {
- module_put(b);
- return err;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(ref_module);
-
-/* Clear the unload stuff of the module. */
-static void module_unload_free(struct module *mod)
-{
- struct module_use *use, *tmp;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
- struct module *i = use->target;
- pr_debug("%s unusing %s\n", mod->name, i->name);
- module_put(i);
- list_del(&use->source_list);
- list_del(&use->target_list);
- kfree(use);
- }
- mutex_unlock(&module_mutex);
-}
-
-#ifdef CONFIG_MODULE_FORCE_UNLOAD
-static inline int try_force_unload(unsigned int flags)
-{
- int ret = (flags & O_TRUNC);
- if (ret)
- add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
- return ret;
-}
-#else
-static inline int try_force_unload(unsigned int flags)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-
-/* Try to release refcount of module, 0 means success. */
-static int try_release_module_ref(struct module *mod)
-{
- int ret;
-
- /* Try to decrement refcnt which we set at loading */
- ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
- BUG_ON(ret < 0);
- if (ret)
- /* Someone can put this right now, recover with checking */
- ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
-
- return ret;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- /* If it's not unused, quit unless we're forcing. */
- if (try_release_module_ref(mod) != 0) {
- *forced = try_force_unload(flags);
- if (!(*forced))
- return -EWOULDBLOCK;
- }
-
- /* Mark it as dying. */
- mod->state = MODULE_STATE_GOING;
-
- return 0;
-}
-
-/**
- * module_refcount - return the refcount or -1 if unloading
- *
- * @mod: the module we're checking
- *
- * Returns:
- * -1 if the module is in the process of unloading
- * otherwise the number of references in the kernel to the module
- */
-int module_refcount(struct module *mod)
-{
- return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
-}
-EXPORT_SYMBOL(module_refcount);
-
-/* This exists whether we can unload or not */
-static void free_module(struct module *mod);
-
-SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
- unsigned int, flags)
-{
- struct module *mod;
- char name[MODULE_NAME_LEN];
- int ret, forced = 0;
-
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
- return -EFAULT;
- name[MODULE_NAME_LEN-1] = '\0';
-
- audit_log_kern_module(name);
-
- if (mutex_lock_interruptible(&module_mutex) != 0)
- return -EINTR;
-
- mod = find_module(name);
- if (!mod) {
- ret = -ENOENT;
- goto out;
- }
-
- if (!list_empty(&mod->source_list)) {
- /* Other modules depend on us: get rid of them first. */
- ret = -EWOULDBLOCK;
- goto out;
- }
-
- /* Doing init or already dying? */
- if (mod->state != MODULE_STATE_LIVE) {
- /* FIXME: if (force), slam module count damn the torpedoes */
- pr_debug("%s already dying\n", mod->name);
- ret = -EBUSY;
- goto out;
- }
-
- /* If it has an init func, it must have an exit func to unload */
- if (mod->init && !mod->exit) {
- forced = try_force_unload(flags);
- if (!forced) {
- /* This module can't be removed */
- ret = -EBUSY;
- goto out;
- }
- }
-
- /* Stop the machine so refcounts can't move and disable module. */
- ret = try_stop_module(mod, flags, &forced);
- if (ret != 0)
- goto out;
-
- mutex_unlock(&module_mutex);
- /* Final destruction now no one is using it. */
- if (mod->exit != NULL)
- mod->exit();
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- ftrace_release_mod(mod);
-
- async_synchronize_full();
-
- /* Store the name of the last unloaded module for diagnostic purposes */
- strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-
- free_module(mod);
- /* someone could wait for the module in add_unformed_module() */
- wake_up_all(&module_wq);
- return 0;
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- struct module_use *use;
- int printed_something = 0;
-
- seq_printf(m, " %i ", module_refcount(mod));
-
- /*
- * Always include a trailing , so userspace can differentiate
- * between this and the old multi-field proc format.
- */
- list_for_each_entry(use, &mod->source_list, source_list) {
- printed_something = 1;
- seq_printf(m, "%s,", use->source->name);
- }
-
- if (mod->init != NULL && mod->exit == NULL) {
- printed_something = 1;
- seq_puts(m, "[permanent],");
- }
-
- if (!printed_something)
- seq_puts(m, "-");
-}
-
-void __symbol_put(const char *symbol)
-{
- struct module *owner;
-
- preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
- BUG();
- module_put(owner);
- preempt_enable();
-}
-EXPORT_SYMBOL(__symbol_put);
-
-/* Note this assumes addr is a function, which it currently always is. */
-void symbol_put_addr(void *addr)
-{
- struct module *modaddr;
- unsigned long a = (unsigned long)dereference_function_descriptor(addr);
-
- if (core_kernel_text(a))
- return;
-
- /*
- * Even though we hold a reference on the module; we still need to
- * disable preemption in order to safely traverse the data structure.
- */
- preempt_disable();
- modaddr = __module_text_address(a);
- BUG_ON(!modaddr);
- module_put(modaddr);
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(symbol_put_addr);
-
-static ssize_t show_refcnt(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%i\n", module_refcount(mk->mod));
-}
-
-static struct module_attribute modinfo_refcnt =
- __ATTR(refcnt, 0444, show_refcnt, NULL);
-
-void __module_get(struct module *module)
-{
- if (module) {
- preempt_disable();
- atomic_inc(&module->refcnt);
- trace_module_get(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(__module_get);
-
-bool try_module_get(struct module *module)
-{
- bool ret = true;
-
- if (module) {
- preempt_disable();
- /* Note: here, we can fail to get a reference */
- if (likely(module_is_live(module) &&
- atomic_inc_not_zero(&module->refcnt) != 0))
- trace_module_get(module, _RET_IP_);
- else
- ret = false;
-
- preempt_enable();
- }
- return ret;
-}
-EXPORT_SYMBOL(try_module_get);
-
-void module_put(struct module *module)
-{
- int ret;
-
- if (module) {
- preempt_disable();
- ret = atomic_dec_if_positive(&module->refcnt);
- WARN_ON(ret < 0); /* Failed to put refcount */
- trace_module_put(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(module_put);
-
-#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- /* We don't know the usage count, or what modules are using. */
- seq_puts(m, " - -");
-}
-
-static inline void module_unload_free(struct module *mod)
-{
-}
-
-int ref_module(struct module *a, struct module *b)
-{
- return strong_try_module_get(b);
-}
-EXPORT_SYMBOL_GPL(ref_module);
-
-static inline int module_unload_init(struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_UNLOAD */
-
-static size_t module_flags_taint(struct module *mod, char *buf)
-{
- size_t l = 0;
- int i;
-
- for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &mod->taints))
- buf[l++] = taint_flags[i].c_true;
- }
-
- return l;
-}
-
-static ssize_t show_initstate(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- const char *state = "unknown";
-
- switch (mk->mod->state) {
- case MODULE_STATE_LIVE:
- state = "live";
- break;
- case MODULE_STATE_COMING:
- state = "coming";
- break;
- case MODULE_STATE_GOING:
- state = "going";
- break;
- default:
- BUG();
- }
- return sprintf(buffer, "%s\n", state);
-}
-
-static struct module_attribute modinfo_initstate =
- __ATTR(initstate, 0444, show_initstate, NULL);
-
-static ssize_t store_uevent(struct module_attribute *mattr,
- struct module_kobject *mk,
- const char *buffer, size_t count)
-{
- int rc;
-
- rc = kobject_synth_uevent(&mk->kobj, buffer, count);
- return rc ? rc : count;
-}
-
-struct module_attribute module_uevent =
- __ATTR(uevent, 0200, NULL, store_uevent);
-
-static ssize_t show_coresize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
-}
-
-static struct module_attribute modinfo_coresize =
- __ATTR(coresize, 0444, show_coresize, NULL);
-
-static ssize_t show_initsize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
-}
-
-static struct module_attribute modinfo_initsize =
- __ATTR(initsize, 0444, show_initsize, NULL);
-
-static ssize_t show_taint(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- size_t l;
-
- l = module_flags_taint(mk->mod, buffer);
- buffer[l++] = '\n';
- return l;
-}
-
-static struct module_attribute modinfo_taint =
- __ATTR(taint, 0444, show_taint, NULL);
-
-static struct module_attribute *modinfo_attrs[] = {
- &module_uevent,
- &modinfo_version,
- &modinfo_srcversion,
- &modinfo_initstate,
- &modinfo_coresize,
- &modinfo_initsize,
- &modinfo_taint,
-#ifdef CONFIG_MODULE_UNLOAD
- &modinfo_refcnt,
-#endif
- NULL,
-};
-
-static const char vermagic[] = VERMAGIC_STRING;
-
-static int try_to_force_load(struct module *mod, const char *reason)
-{
-#ifdef CONFIG_MODULE_FORCE_LOAD
- if (!test_taint(TAINT_FORCED_MODULE))
- pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
- return 0;
-#else
- return -ENOEXEC;
-#endif
-}
-
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
- return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- Elf_Shdr *sechdrs = info->sechdrs;
- unsigned int versindex = info->index.vers;
- unsigned int i, num_versions;
- struct modversion_info *versions;
-
- /* Exporting module didn't supply crcs? OK, we're already tainted. */
- if (!crc)
- return 1;
-
- /* No versions at all? modprobe --force does this. */
- if (versindex == 0)
- return try_to_force_load(mod, symname) == 0;
-
- versions = (void *) sechdrs[versindex].sh_addr;
- num_versions = sechdrs[versindex].sh_size
- / sizeof(struct modversion_info);
-
- for (i = 0; i < num_versions; i++) {
- u32 crcval;
-
- if (strcmp(versions[i].name, symname) != 0)
- continue;
-
- if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
- crcval = resolve_rel_crc(crc);
- else
- crcval = *crc;
- if (versions[i].crc == crcval)
- return 1;
- pr_debug("Found checksum %X vs module %lX\n",
- crcval, versions[i].crc);
- goto bad_version;
- }
-
- /* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
- return 1;
-
-bad_version:
- pr_warn("%s: disagrees about version of symbol %s\n",
- info->name, symname);
- return 0;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- const s32 *crc;
-
- /*
- * Since this should be found in kernel (which can't be removed), no
- * locking is necessary -- use preempt_disable() to placate lockdep.
- */
- preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, true, false)) {
- preempt_enable();
- BUG();
- }
- preempt_enable();
- return check_version(info, "module_layout", mod, crc);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- if (has_crcs) {
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
- }
- return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- return 1;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
-static char *get_modinfo(const struct load_info *info, const char *tag);
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev);
-
-static int verify_namespace_is_imported(const struct load_info *info,
- const struct kernel_symbol *sym,
- struct module *mod)
-{
- const char *namespace;
- char *imported_namespace;
-
- namespace = kernel_symbol_namespace(sym);
- if (namespace && namespace[0]) {
- imported_namespace = get_modinfo(info, "import_ns");
- while (imported_namespace) {
- if (strcmp(namespace, imported_namespace) == 0)
- return 0;
- imported_namespace = get_next_modinfo(
- info, "import_ns", imported_namespace);
- }
-#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
- pr_warn(
-#else
- pr_err(
-#endif
- "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
- mod->name, kernel_symbol_name(sym), namespace);
-#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
- return -EINVAL;
-#endif
- }
- return 0;
-}
-
-
-/* Resolve a symbol for this module. I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(struct module *mod,
- const struct load_info *info,
- const char *name,
- char ownername[])
-{
- struct module *owner;
- const struct kernel_symbol *sym;
- const s32 *crc;
- int err;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
- !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- if (!sym)
- goto unlock;
-
- if (!check_version(info, name, mod, crc)) {
- sym = ERR_PTR(-EINVAL);
- goto getname;
- }
-
- err = verify_namespace_is_imported(info, sym, mod);
- if (err) {
- sym = ERR_PTR(err);
- goto getname;
- }
-
- err = ref_module(mod, owner);
- if (err) {
- sym = ERR_PTR(err);
- goto getname;
- }
-
-getname:
- /* We must make copy under the lock if we failed to get ref. */
- strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
-unlock:
- mutex_unlock(&module_mutex);
- return sym;
-}
-
-static const struct kernel_symbol *
-resolve_symbol_wait(struct module *mod,
- const struct load_info *info,
- const char *name)
-{
- const struct kernel_symbol *ksym;
- char owner[MODULE_NAME_LEN];
-
- if (wait_event_interruptible_timeout(module_wq,
- !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
- || PTR_ERR(ksym) != -EBUSY,
- 30 * HZ) <= 0) {
- pr_warn("%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
- }
- return ksym;
-}
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-
-struct module_sect_attr {
- struct bin_attribute battr;
- unsigned long address;
-};
-
-struct module_sect_attrs {
- struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[];
-};
-
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *battr,
- char *buf, loff_t pos, size_t count)
-{
- struct module_sect_attr *sattr =
- container_of(battr, struct module_sect_attr, battr);
-
- if (pos != 0)
- return -EINVAL;
-
- return sprintf(buf, "0x%px\n",
- kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL);
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
- unsigned int section;
-
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].battr.attr.name);
- kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int nloaded = 0, i, size[2];
- struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct bin_attribute **gattr;
-
- /* Count loaded sections and allocate structures */
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]))
- nloaded++;
- size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
- sizeof(sect_attrs->grp.bin_attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (sect_attrs == NULL)
- return;
-
- /* Setup section attributes. */
- sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
- sect_attrs->nsections = 0;
- sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.bin_attrs[0];
- for (i = 0; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *sec = &info->sechdrs[i];
- if (sect_empty(sec))
- continue;
- sysfs_bin_attr_init(&sattr->battr);
- sattr->address = sec->sh_addr;
- sattr->battr.attr.name =
- kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (sattr->battr.attr.name == NULL)
- goto out;
- sect_attrs->nsections++;
- sattr->battr.read = module_sect_read;
- sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4);
- sattr->battr.attr.mode = 0400;
- *(gattr++) = &(sattr++)->battr;
- }
- *gattr = NULL;
-
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
- goto out;
-
- mod->sect_attrs = sect_attrs;
- return;
- out:
- free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
- if (mod->sect_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->sect_attrs->grp);
- /* We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately. */
- free_sect_attrs(mod->sect_attrs);
- mod->sect_attrs = NULL;
- }
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t pos, size_t count)
-{
- /*
- * The caller checked the pos and count against our size.
- */
- memcpy(buf, bin_attr->private + pos, count);
- return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
-{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
- kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int notes, loaded, i;
- struct module_notes_attrs *notes_attrs;
- struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
-
- /* Count notes sections and allocate structures. */
- notes = 0;
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]) &&
- (info->sechdrs[i].sh_type == SHT_NOTE))
- ++notes;
-
- if (notes == 0)
- return;
-
- notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
- GFP_KERNEL);
- if (notes_attrs == NULL)
- return;
-
- notes_attrs->notes = notes;
- nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
- if (sect_empty(&info->sechdrs[i]))
- continue;
- if (info->sechdrs[i].sh_type == SHT_NOTE) {
- sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
- nattr->attr.mode = S_IRUGO;
- nattr->size = info->sechdrs[i].sh_size;
- nattr->private = (void *) info->sechdrs[i].sh_addr;
- nattr->read = module_notes_read;
- ++nattr;
- }
- ++loaded;
- }
-
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
- goto out;
-
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
- goto out;
-
- mod->notes_attrs = notes_attrs;
- return;
-
- out:
- free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
- mutex_unlock(&module_mutex);
-#endif
-}
-
-static int add_usage_links(struct module *mod)
-{
- int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- ret = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- if (ret)
- break;
- }
- mutex_unlock(&module_mutex);
- if (ret)
- del_usage_links(mod);
-#endif
- return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- struct module_attribute *temp_attr;
- int error = 0;
- int i;
-
- mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
- (ARRAY_SIZE(modinfo_attrs) + 1)),
- GFP_KERNEL);
- if (!mod->modinfo_attrs)
- return -ENOMEM;
-
- temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (!attr->test || attr->test(mod)) {
- memcpy(temp_attr, attr, sizeof(*temp_attr));
- sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,
- &temp_attr->attr);
- if (error)
- goto error_out;
- ++temp_attr;
- }
- }
-
- return 0;
-
-error_out:
- if (i > 0)
- module_remove_modinfo_attrs(mod, --i);
- else
- kfree(mod->modinfo_attrs);
- return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
- if (end >= 0 && i > end)
- break;
- /* pick a field to test for end of list */
- if (!attr->attr.name)
- break;
- sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
- if (attr->free)
- attr->free(mod);
- }
- kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
- DECLARE_COMPLETION_ONSTACK(c);
- mod->mkobj.kobj_completion = &c;
- kobject_put(&mod->mkobj.kobj);
- wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
- int err;
- struct kobject *kobj;
-
- if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
- err = -EINVAL;
- goto out;
- }
-
- kobj = kset_find_obj(module_kset, mod->name);
- if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
- kobject_put(kobj);
- err = -EINVAL;
- goto out;
- }
-
- mod->mkobj.mod = mod;
-
- memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
- mod->mkobj.kobj.kset = module_kset;
- err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
- "%s", mod->name);
- if (err)
- mod_kobject_put(mod);
-
- /* delay uevent until full sysfs population */
-out:
- return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- int err;
-
- err = mod_sysfs_init(mod);
- if (err)
- goto out;
-
- mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
- if (!mod->holders_dir) {
- err = -ENOMEM;
- goto out_unreg;
- }
-
- err = module_param_sysfs_setup(mod, kparam, num_params);
- if (err)
- goto out_unreg_holders;
-
- err = module_add_modinfo_attrs(mod);
- if (err)
- goto out_unreg_param;
-
- err = add_usage_links(mod);
- if (err)
- goto out_unreg_modinfo_attrs;
-
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
-
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
- return 0;
-
-out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
- module_param_sysfs_remove(mod);
-out_unreg_holders:
- kobject_put(mod->holders_dir);
-out_unreg:
- mod_kobject_put(mod);
-out:
- return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
- mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
- del_usage_links(mod);
- module_remove_modinfo_attrs(mod, -1);
- module_param_sysfs_remove(mod);
- kobject_put(mod->mkobj.drivers_dir);
- kobject_put(mod->holders_dir);
- mod_sysfs_fini(mod);
-}
-
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- * [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^ ^ ^ ^
- * ro_size ------------------------| | |
- * ro_after_init_size -----------------------------| |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base,
- layout->text_size >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
-}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->text_size,
- (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_after_init_size,
- (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
- if (!rodata_enabled)
- return;
-
- set_vm_flush_reset_perms(mod->core_layout.base);
- set_vm_flush_reset_perms(mod->init_layout.base);
- frob_text(&mod->core_layout, set_memory_ro);
-
- frob_rodata(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- frob_rodata(&mod->init_layout, set_memory_ro);
-
- if (after_init)
- frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_nx);
- frob_ro_after_init(&mod->core_layout, set_memory_nx);
- frob_writable_data(&mod->core_layout, set_memory_nx);
- frob_rodata(&mod->init_layout, set_memory_nx);
- frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
- int i;
-
- for (i = 0; i < hdr->e_shnum; i++) {
- if ((sechdrs[i].sh_flags & shf_wx) == shf_wx)
- return -ENOEXEC;
- }
-
- return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- unsigned int size, symndx;
- int ret;
-
- size = sizeof(*mod->klp_info);
- mod->klp_info = kmalloc(size, GFP_KERNEL);
- if (mod->klp_info == NULL)
- return -ENOMEM;
-
- /* Elf header */
- size = sizeof(mod->klp_info->hdr);
- memcpy(&mod->klp_info->hdr, info->hdr, size);
-
- /* Elf section header table */
- size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
- mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
- if (mod->klp_info->sechdrs == NULL) {
- ret = -ENOMEM;
- goto free_info;
- }
-
- /* Elf section name string table */
- size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
- mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
- if (mod->klp_info->secstrings == NULL) {
- ret = -ENOMEM;
- goto free_sechdrs;
- }
-
- /* Elf symbol section index */
- symndx = info->index.sym;
- mod->klp_info->symndx = symndx;
-
- /*
- * For livepatch modules, core_kallsyms.symtab is a complete
- * copy of the original symbol table. Adjust sh_addr to point
- * to core_kallsyms.symtab since the copy of the symtab in module
- * init memory is freed at the end of do_init_module().
- */
- mod->klp_info->sechdrs[symndx].sh_addr = \
- (unsigned long) mod->core_kallsyms.symtab;
-
- return 0;
-
-free_sechdrs:
- kfree(mod->klp_info->sechdrs);
-free_info:
- kfree(mod->klp_info);
- return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
- kfree(mod->klp_info->sechdrs);
- kfree(mod->klp_info->secstrings);
- kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
-}
-#endif /* CONFIG_LIVEPATCH */
-
-void __weak module_memfree(void *module_region)
-{
- /*
- * This memory may be RO, and freeing RO memory in an interrupt is not
- * supported by vmalloc.
- */
- WARN_ON(in_interrupt());
- vfree(module_region);
-}
-
-void __weak module_arch_cleanup(struct module *mod)
-{
-}
-
-void __weak module_arch_freeing_init(struct module *mod)
-{
-}
-
-/* Free a module, remove from lists, etc. */
-static void free_module(struct module *mod)
-{
- trace_module_free(mod);
-
- mod_sysfs_teardown(mod);
-
- /* We leave it in list to prevent duplicate loads, but make sure
- * that noone uses it while it's being deconstructed. */
- mutex_lock(&module_mutex);
- mod->state = MODULE_STATE_UNFORMED;
- mutex_unlock(&module_mutex);
-
- /* Remove dynamic debug info */
- ddebug_remove_module(mod->name);
-
- /* Arch-specific cleanup. */
- module_arch_cleanup(mod);
-
- /* Module unload stuff */
- module_unload_free(mod);
-
- /* Free any allocated parameters. */
- destroy_params(mod->kp, mod->num_kp);
-
- if (is_livepatch_module(mod))
- free_module_elf(mod);
-
- /* Now we can delete it from the lists */
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- mod_tree_remove(mod);
- /* Remove this module from bug list, this uses list_del_rcu */
- module_bug_cleanup(mod);
- /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
-
- /* This may be empty, but that's OK */
- module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- kfree(mod->args);
- percpu_modfree(mod);
-
- /* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
- /* Finally, free the core (containing the module structure) */
- module_memfree(mod->core_layout.base);
-}
-
-void *__symbol_get(const char *symbol)
-{
- struct module *owner;
- const struct kernel_symbol *sym;
-
- preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
- if (sym && strong_try_module_get(owner))
- sym = NULL;
- preempt_enable();
-
- return sym ? (void *)kernel_symbol_value(sym) : NULL;
-}
-EXPORT_SYMBOL_GPL(__symbol_get);
-
-/*
- * Ensure that an exported symbol [global namespace] does not already exist
- * in the kernel or in some other module's exported symbol table.
- *
- * You must hold the module_mutex.
- */
-static int verify_exported_symbols(struct module *mod)
-{
- unsigned int i;
- struct module *owner;
- const struct kernel_symbol *s;
- struct {
- const struct kernel_symbol *sym;
- unsigned int num;
- } arr[] = {
- { mod->syms, mod->num_syms },
- { mod->gpl_syms, mod->num_gpl_syms },
- { mod->gpl_future_syms, mod->num_gpl_future_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms, mod->num_unused_syms },
- { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
- };
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- true, false)) {
- pr_err("%s: exports duplicate symbol %s"
- " (owned by %s)\n",
- mod->name, kernel_symbol_name(s),
- module_name(owner));
- return -ENOEXEC;
- }
- }
- }
- return 0;
-}
-
-/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(struct module *mod, const struct load_info *info)
-{
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- Elf_Sym *sym = (void *)symsec->sh_addr;
- unsigned long secbase;
- unsigned int i;
- int ret = 0;
- const struct kernel_symbol *ksym;
-
- for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
- const char *name = info->strtab + sym[i].st_name;
-
- switch (sym[i].st_shndx) {
- case SHN_COMMON:
- /* Ignore common symbols */
- if (!strncmp(name, "__gnu_lto", 9))
- break;
-
- /* We compiled with -fno-common. These are not
- supposed to happen. */
- pr_debug("Common symbol: %s\n", name);
- pr_warn("%s: please compile with -fno-common\n",
- mod->name);
- ret = -ENOEXEC;
- break;
-
- case SHN_ABS:
- /* Don't need to do anything */
- pr_debug("Absolute symbol: 0x%08lx\n",
- (long)sym[i].st_value);
- break;
-
- case SHN_LIVEPATCH:
- /* Livepatch symbols are resolved by livepatch */
- break;
-
- case SHN_UNDEF:
- ksym = resolve_symbol_wait(mod, info, name);
- /* Ok if resolved. */
- if (ksym && !IS_ERR(ksym)) {
- sym[i].st_value = kernel_symbol_value(ksym);
- break;
- }
-
- /* Ok if weak. */
- if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
- break;
-
- ret = PTR_ERR(ksym) ?: -ENOENT;
- pr_warn("%s: Unknown symbol %s (err %d)\n",
- mod->name, name, ret);
- break;
-
- default:
- /* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == info->index.pcpu)
- secbase = (unsigned long)mod_percpu(mod);
- else
- secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
- sym[i].st_value += secbase;
- break;
- }
- }
-
- return ret;
-}
-
-static int apply_relocations(struct module *mod, const struct load_info *info)
-{
- unsigned int i;
- int err = 0;
-
- /* Now do relocations. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- unsigned int infosec = info->sechdrs[i].sh_info;
-
- /* Not a valid relocation section? */
- if (infosec >= info->hdr->e_shnum)
- continue;
-
- /* Don't bother with non-allocated sections */
- if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
- continue;
-
- if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
- err = klp_apply_section_relocs(mod, info->sechdrs,
- info->secstrings,
- info->strtab,
- info->index.sym, i,
- NULL);
- else if (info->sechdrs[i].sh_type == SHT_REL)
- err = apply_relocate(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- else if (info->sechdrs[i].sh_type == SHT_RELA)
- err = apply_relocate_add(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- if (err < 0)
- break;
- }
- return err;
-}
-
-/* Additional bytes needed by arch in front of individual sections */
-unsigned int __weak arch_mod_section_prepend(struct module *mod,
- unsigned int section)
-{
- /* default implementation just returns zero */
- return 0;
-}
-
-/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
- Elf_Shdr *sechdr, unsigned int section)
-{
- long ret;
-
- *size += arch_mod_section_prepend(mod, section);
- ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
- *size = ret + sechdr->sh_size;
- return ret;
-}
-
-/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- might -- code, read-only data, read-write data, small data. Tally
- sizes, and place the offsets into sh_entsize fields: high bit means it
- belongs in init. */
-static void layout_sections(struct module *mod, struct load_info *info)
-{
- static unsigned long const masks[][2] = {
- /* NOTE: all executable code must be the first section
- * in this array; otherwise modify the text_size
- * finder in the two loops below */
- { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
- { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
- { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
- { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
- { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
- };
- unsigned int m, i;
-
- for (i = 0; i < info->hdr->e_shnum; i++)
- info->sechdrs[i].sh_entsize = ~0UL;
-
- pr_debug("Core section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || module_init_section(sname))
- continue;
- s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.text_size = mod->core_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_size = mod->core_layout.size;
- break;
- case 2: /* RO after init */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_after_init_size = mod->core_layout.size;
- break;
- case 4: /* whole core */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- break;
- }
- }
-
- pr_debug("Init section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || !module_init_section(sname))
- continue;
- s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
- | INIT_OFFSET_MASK);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.text_size = mod->init_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- mod->init_layout.ro_size = mod->init_layout.size;
- break;
- case 2:
- /*
- * RO after init doesn't apply to init_layout (only
- * core_layout), so it just takes the value of ro_size.
- */
- mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
- break;
- case 4: /* whole init */
- mod->init_layout.size = debug_align(mod->init_layout.size);
- break;
- }
- }
-}
-
-static void set_license(struct module *mod, const char *license)
-{
- if (!license)
- license = "unspecified";
-
- if (!license_is_gpl_compatible(license)) {
- if (!test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license '%s' taints kernel.\n",
- mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
- }
-}
-
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
- /* Skip non-zero chars */
- while (string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
-
- /* Skip any zero padding. */
- while (!string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
- return string;
-}
-
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev)
-{
- char *p;
- unsigned int taglen = strlen(tag);
- Elf_Shdr *infosec = &info->sechdrs[info->index.info];
- unsigned long size = infosec->sh_size;
-
- /*
- * get_modinfo() calls made before rewrite_section_headers()
- * must use sh_offset, as sh_addr isn't set!
- */
- char *modinfo = (char *)info->hdr + infosec->sh_offset;
-
- if (prev) {
- size -= prev - modinfo;
- modinfo = next_string(prev, &size);
- }
-
- for (p = modinfo; p; p = next_string(p, &size)) {
- if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
- return p + taglen + 1;
- }
- return NULL;
-}
-
-static char *get_modinfo(const struct load_info *info, const char *tag)
-{
- return get_next_modinfo(info, tag, NULL);
-}
-
-static void setup_modinfo(struct module *mod, struct load_info *info)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->setup)
- attr->setup(mod, get_modinfo(info, attr->attr.name));
- }
-}
-
-static void free_modinfo(struct module *mod)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->free)
- attr->free(mod);
- }
-}
-
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
-{
- return bsearch(name, start, stop - start,
- sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
- const struct module *mod)
-{
- const struct kernel_symbol *ks;
- if (!mod)
- ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
- else
- ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
- return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
- const Elf_Shdr *sechdrs = info->sechdrs;
-
- if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
- if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
- return 'v';
- else
- return 'w';
- }
- if (sym->st_shndx == SHN_UNDEF)
- return 'U';
- if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
- return 'a';
- if (sym->st_shndx >= SHN_LORESERVE)
- return '?';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
- return 't';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
- && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
- if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
- return 'r';
- else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 'g';
- else
- return 'd';
- }
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 's';
- else
- return 'b';
- }
- if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug")) {
- return 'n';
- }
- return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum, unsigned int pcpundx)
-{
- const Elf_Shdr *sec;
-
- if (src->st_shndx == SHN_UNDEF
- || src->st_shndx >= shnum
- || !src->st_name)
- return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
- if (src->st_shndx == pcpundx)
- return true;
-#endif
-
- sec = sechdrs + src->st_shndx;
- if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
- || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
- return false;
-
- return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep. This is simple, but has the effect of making multiple
- * copies of duplicates. We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *symsect = info->sechdrs + info->index.sym;
- Elf_Shdr *strsect = info->sechdrs + info->index.str;
- const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size = 0;
-
- /* Put symbol section at end of init part of module. */
- symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
- src = (void *)info->hdr + symsect->sh_offset;
- nsrc = symsect->sh_size / sizeof(*src);
-
- /* Compute total space required for the core symbols' strtab. */
- for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- strtab_size += strlen(&info->strtab[src[i].st_name])+1;
- ndst++;
- }
- }
-
- /* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_layout.size += strtab_size;
- info->core_typeoffs = mod->core_layout.size;
- mod->core_layout.size += ndst * sizeof(char);
- mod->core_layout.size = debug_align(mod->core_layout.size);
-
- /* Put string table section at end of init part of module. */
- strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* We'll tack temporary mod_kallsyms on the end. */
- mod->init_layout.size = ALIGN(mod->init_layout.size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_layout.size;
- mod->init_layout.size += sizeof(struct mod_kallsyms);
- info->init_typeoffs = mod->init_layout.size;
- mod->init_layout.size += nsrc * sizeof(char);
- mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section. Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
- unsigned int i, ndst;
- const Elf_Sym *src;
- Elf_Sym *dst;
- char *s;
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
- /* Set up to point into init section. */
- mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
- mod->kallsyms->symtab = (void *)symsec->sh_addr;
- mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
- /* Make sure we get permanent strtab: don't use info->strtab. */
- mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
- mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
- /*
- * Now populate the cut down core kallsyms for after init
- * and set types up while we still have access to sections.
- */
- mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
- mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
- src = mod->kallsyms->symtab;
- for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
- mod->kallsyms->typetab[i] = elf_type(src + i, info);
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- mod->core_kallsyms.typetab[ndst] =
- mod->kallsyms->typetab[i];
- dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
- }
- }
- mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
-{
- if (!debug)
- return;
- ddebug_add_module(debug, num, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
-{
- if (debug)
- ddebug_remove_module(mod->name);
-}
-
-void * __weak module_alloc(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
-bool __weak module_init_section(const char *name)
-{
- return strstarts(name, ".init");
-}
-
-bool __weak module_exit_section(const char *name)
-{
- return strstarts(name, ".exit");
-}
-
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
- }
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
- int err = -ENODATA;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- const char *reason;
- const void *mod = info->hdr;
-
- /*
- * Require flags == 0, as a module with version information
- * removed is no longer the module that was signed
- */
- if (flags == 0 &&
- info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
- /* We truncate the module to discard the signature */
- info->len -= markerlen;
- err = mod_verify_sig(mod, info);
- }
-
- switch (err) {
- case 0:
- info->sig_ok = true;
- return 0;
-
- /* We don't permit modules to be loaded into trusted kernels
- * without a valid signature on them, but if we're not
- * enforcing, certain errors are non-fatal.
- */
- case -ENODATA:
- reason = "Loading of unsigned module";
- goto decide;
- case -ENOPKG:
- reason = "Loading of module with unsupported crypto";
- goto decide;
- case -ENOKEY:
- reason = "Loading of module with unavailable key";
- decide:
- if (is_module_sig_enforced()) {
- pr_notice("%s: %s is rejected\n", info->name, reason);
- return -EKEYREJECTED;
- }
-
- return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-
- /* All other errors are fatal, including nomem, unparseable
- * signatures and signature check failures - even if signatures
- * aren't required.
- */
- default:
- return err;
- }
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
- return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
-/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
-static int elf_header_check(struct load_info *info)
-{
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
-
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
- || info->hdr->e_type != ET_REL
- || !elf_check_arch(info->hdr)
- || info->hdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (info->hdr->e_shoff >= info->len
- || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff))
- return -ENOEXEC;
-
- return 0;
-}
-
-#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
-
-static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
-{
- do {
- unsigned long n = min(len, COPY_CHUNK_SIZE);
-
- if (copy_from_user(dst, usrc, n) != 0)
- return -EFAULT;
- cond_resched();
- dst += n;
- usrc += n;
- len -= n;
- } while (len);
- return 0;
-}
-
-#ifdef CONFIG_LIVEPATCH
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- mod->klp = true;
- add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
- pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
- mod->name);
- }
-
- return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
- mod->name);
- return -ENOEXEC;
- }
-
- return 0;
-}
-#endif /* CONFIG_LIVEPATCH */
-
-static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
-{
- if (retpoline_module_ok(get_modinfo(info, "retpoline")))
- return;
-
- pr_warn("%s: loading module not compiled with retpoline compiler.\n",
- mod->name);
-}
-
-/* Sets info->hdr and info->len. */
-static int copy_module_from_user(const void __user *umod, unsigned long len,
- struct load_info *info)
-{
- int err;
-
- info->len = len;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
-
- err = security_kernel_load_data(LOADING_MODULE);
- if (err)
- return err;
-
- /* Suck in entire file: we'll want most of it. */
- info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
- if (!info->hdr)
- return -ENOMEM;
-
- if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
- vfree(info->hdr);
- return -EFAULT;
- }
-
- return 0;
-}
-
-static void free_copy(struct load_info *info)
-{
- vfree(info->hdr);
-}
-
-static int rewrite_section_headers(struct load_info *info, int flags)
-{
- unsigned int i;
-
- /* This should always be true, but let's be sure. */
- info->sechdrs[0].sh_addr = 0;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- if (shdr->sh_type != SHT_NOBITS
- && info->len < shdr->sh_offset + shdr->sh_size) {
- pr_err("Module len %lu truncated\n", info->len);
- return -ENOEXEC;
- }
-
- /* Mark all sections sh_addr with their address in the
- temporary image. */
- shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
-
-#ifndef CONFIG_MODULE_UNLOAD
- /* Don't load .exit sections */
- if (module_exit_section(info->secstrings+shdr->sh_name))
- shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
-#endif
- }
-
- /* Track but don't keep modinfo and version sections. */
- info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
- info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- return 0;
-}
-
-/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Set info->mod to the temporary copy of the module in info->hdr. The final one
- * will be allocated in move_module().
- */
-static int setup_load_info(struct load_info *info, int flags)
-{
- unsigned int i;
-
- /* Set up the convenience variables */
- info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
- info->secstrings = (void *)info->hdr
- + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
-
- /* Try to find a name early so we can log errors with a module name */
- info->index.info = find_sec(info, ".modinfo");
- if (info->index.info)
- info->name = get_modinfo(info, "name");
-
- /* Find internal symbols and strings. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
- info->index.sym = i;
- info->index.str = info->sechdrs[i].sh_link;
- info->strtab = (char *)info->hdr
- + info->sechdrs[info->index.str].sh_offset;
- break;
- }
- }
-
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
- }
-
- info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
- if (!info->index.mod) {
- pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
- }
- /* This is temporary: point mod into copy of data. */
- info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
-
- /*
- * If we didn't load the .modinfo 'name' field earlier, fall back to
- * on-disk struct mod 'name' field.
- */
- if (!info->name)
- info->name = info->mod->name;
-
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
-
- info->index.pcpu = find_pcpusec(info);
-
- return 0;
-}
-
-static int check_modinfo(struct module *mod, struct load_info *info, int flags)
-{
- const char *modmagic = get_modinfo(info, "vermagic");
- int err;
-
- if (flags & MODULE_INIT_IGNORE_VERMAGIC)
- modmagic = NULL;
-
- /* This is allowed: modprobe --force will invalidate it. */
- if (!modmagic) {
- err = try_to_force_load(mod, "bad vermagic");
- if (err)
- return err;
- } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- pr_err("%s: version magic '%s' should be '%s'\n",
- info->name, modmagic, vermagic);
- return -ENOEXEC;
- }
-
- if (!get_modinfo(info, "intree")) {
- if (!test_taint(TAINT_OOT_MODULE))
- pr_warn("%s: loading out-of-tree module taints kernel.\n",
- mod->name);
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
- }
-
- check_modinfo_retpoline(mod, info);
-
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
- }
-
- err = check_modinfo_livepatch(mod, info);
- if (err)
- return err;
-
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(info, "license"));
-
- return 0;
-}
-
-static int find_module_sections(struct module *mod, struct load_info *info)
-{
- mod->kp = section_objs(info, "__param",
- sizeof(*mod->kp), &mod->num_kp);
- mod->syms = section_objs(info, "__ksymtab",
- sizeof(*mod->syms), &mod->num_syms);
- mod->crcs = section_addr(info, "__kcrctab");
- mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(info,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(info, "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(info, "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
-#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(info, ".ctors",
- sizeof(*mod->ctors), &mod->num_ctors);
- if (!mod->ctors)
- mod->ctors = section_objs(info, ".init_array",
- sizeof(*mod->ctors), &mod->num_ctors);
- else if (find_sec(info, ".init_array")) {
- /*
- * This shouldn't happen with same compiler and binutils
- * building all parts of the module.
- */
- pr_warn("%s: has both .ctors and .init_array.\n",
- mod->name);
- return -EINVAL;
- }
-#endif
-
- mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
- &mod->noinstr_text_size);
-
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
- sizeof(*mod->tracepoints_ptrs),
- &mod->num_tracepoints);
-#endif
-#ifdef CONFIG_TREE_SRCU
- mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
- sizeof(*mod->srcu_struct_ptrs),
- &mod->num_srcu_structs);
-#endif
-#ifdef CONFIG_BPF_EVENTS
- mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
- sizeof(*mod->bpf_raw_events),
- &mod->num_bpf_raw_events);
-#endif
-#ifdef CONFIG_JUMP_LABEL
- mod->jump_entries = section_objs(info, "__jump_table",
- sizeof(*mod->jump_entries),
- &mod->num_jump_entries);
-#endif
-#ifdef CONFIG_EVENT_TRACING
- mod->trace_events = section_objs(info, "_ftrace_events",
- sizeof(*mod->trace_events),
- &mod->num_trace_events);
- mod->trace_evals = section_objs(info, "_ftrace_eval_map",
- sizeof(*mod->trace_evals),
- &mod->num_trace_evals);
-#endif
-#ifdef CONFIG_TRACING
- mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
- sizeof(*mod->trace_bprintk_fmt_start),
- &mod->num_trace_bprintk_fmt);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
- /* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
- sizeof(*mod->ftrace_callsites),
- &mod->num_ftrace_callsites);
-#endif
-#ifdef CONFIG_FUNCTION_ERROR_INJECTION
- mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
- sizeof(*mod->ei_funcs),
- &mod->num_ei_funcs);
-#endif
-#ifdef CONFIG_KPROBES
- mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
- &mod->kprobes_text_size);
- mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
- sizeof(unsigned long),
- &mod->num_kprobe_blacklist);
-#endif
- mod->extable = section_objs(info, "__ex_table",
- sizeof(*mod->extable), &mod->num_exentries);
-
- if (section_addr(info, "__obsparm"))
- pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
-
- info->debug = section_objs(info, "__verbose",
- sizeof(*info->debug), &info->num_debug);
-
- return 0;
-}
-
-static int move_module(struct module *mod, struct load_info *info)
-{
- int i;
- void *ptr;
-
- /* Do the allocs. */
- ptr = module_alloc(mod->core_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr)
- return -ENOMEM;
-
- memset(ptr, 0, mod->core_layout.size);
- mod->core_layout.base = ptr;
-
- if (mod->init_layout.size) {
- ptr = module_alloc(mod->init_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr) {
- module_memfree(mod->core_layout.base);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_layout.size);
- mod->init_layout.base = ptr;
- } else
- mod->init_layout.base = NULL;
-
- /* Transfer each section which specifies SHF_ALLOC */
- pr_debug("final section addresses:\n");
- for (i = 0; i < info->hdr->e_shnum; i++) {
- void *dest;
- Elf_Shdr *shdr = &info->sechdrs[i];
-
- if (!(shdr->sh_flags & SHF_ALLOC))
- continue;
-
- if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->init_layout.base
- + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
- else
- dest = mod->core_layout.base + shdr->sh_entsize;
-
- if (shdr->sh_type != SHT_NOBITS)
- memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
- /* Update sh_addr to point to copy in image. */
- shdr->sh_addr = (unsigned long)dest;
- pr_debug("\t0x%lx %s\n",
- (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
- }
-
- return 0;
-}
-
-static int check_module_license_and_versions(struct module *mod)
-{
- int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
-
- /*
- * ndiswrapper is under GPL by itself, but loads proprietary modules.
- * Don't use add_taint_module(), as it would prevent ndiswrapper from
- * using GPL-only symbols it needs.
- */
- if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license taints kernel.\n", mod->name);
-
-#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs)
- || (mod->num_gpl_syms && !mod->gpl_crcs)
- || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !mod->unused_crcs)
- || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
- ) {
- return try_to_force_load(mod,
- "no versions for exported symbols");
- }
-#endif
- return 0;
-}
-
-static void flush_module_icache(const struct module *mod)
-{
- /*
- * Flush the instruction cache, since we've played with text.
- * Do it before processing of module parameters, so the module
- * can provide parameter accessor functions of its own.
- */
- if (mod->init_layout.base)
- flush_icache_range((unsigned long)mod->init_layout.base,
- (unsigned long)mod->init_layout.base
- + mod->init_layout.size);
- flush_icache_range((unsigned long)mod->core_layout.base,
- (unsigned long)mod->core_layout.base + mod->core_layout.size);
-}
-
-int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-/* module_blacklist is a comma-separated list of module names */
-static char *module_blacklist;
-static bool blacklisted(const char *module_name)
-{
- const char *p;
- size_t len;
-
- if (!module_blacklist)
- return false;
-
- for (p = module_blacklist; *p; p += len) {
- len = strcspn(p, ",");
- if (strlen(module_name) == len && !memcmp(module_name, p, len))
- return true;
- if (p[len] == ',')
- len++;
- }
- return false;
-}
-core_param(module_blacklist, module_blacklist, charp, 0400);
-
-static struct module *layout_and_allocate(struct load_info *info, int flags)
-{
- struct module *mod;
- unsigned int ndx;
- int err;
-
- err = check_modinfo(info->mod, info, flags);
- if (err)
- return ERR_PTR(err);
-
- /* Allow arches to frob section contents and sizes. */
- err = module_frob_arch_sections(info->hdr, info->sechdrs,
- info->secstrings, info->mod);
- if (err < 0)
- return ERR_PTR(err);
-
- err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
- info->secstrings, info->mod);
- if (err < 0)
- return ERR_PTR(err);
-
- /* We will do a special allocation for per-cpu sections later. */
- info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- /*
- * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
- * layout_sections() can put it in the right place.
- * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
- */
- ndx = find_sec(info, ".data..ro_after_init");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
- /*
- * Mark the __jump_table section as ro_after_init as well: these data
- * structures are never modified, with the exception of entries that
- * refer to code in the __init section, which are annotated as such
- * at module load time.
- */
- ndx = find_sec(info, "__jump_table");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-
- /* Determine total sizes, and put offsets in sh_entsize. For now
- this is done generically; there doesn't appear to be any
- special cases for the architectures. */
- layout_sections(info->mod, info);
- layout_symtab(info->mod, info);
-
- /* Allocate and move to the final place */
- err = move_module(info->mod, info);
- if (err)
- return ERR_PTR(err);
-
- /* Module has been copied to its final place now: return it. */
- mod = (void *)info->sechdrs[info->index.mod].sh_addr;
- kmemleak_load_module(mod, info);
- return mod;
-}
-
-/* mod is no longer valid after this! */
-static void module_deallocate(struct module *mod, struct load_info *info)
-{
- percpu_modfree(mod);
- module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- module_memfree(mod->core_layout.base);
-}
-
-int __weak module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
-static int post_relocation(struct module *mod, const struct load_info *info)
-{
- /* Sort exception table now relocations are done. */
- sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
- /* Copy relocated percpu area over. */
- percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
- info->sechdrs[info->index.pcpu].sh_size);
-
- /* Setup kallsyms-specific fields. */
- add_kallsyms(mod, info);
-
- /* Arch-specific module finalizing. */
- return module_finalize(info->hdr, info->sechdrs, mod);
-}
-
-/* Is this module of this name done loading? No locks held. */
-static bool finished_loading(const char *name)
-{
- struct module *mod;
- bool ret;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
- mutex_unlock(&module_mutex);
-
- return ret;
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
-}
-
-/* For freeing module_init on success, in case kallsyms traversing */
-struct mod_initfree {
- struct llist_node node;
- void *module_init;
-};
-
-static void do_free_init(struct work_struct *w)
-{
- struct llist_node *pos, *n, *list;
- struct mod_initfree *initfree;
-
- list = llist_del_all(&init_free_list);
-
- synchronize_rcu();
-
- llist_for_each_safe(pos, n, list) {
- initfree = container_of(pos, struct mod_initfree, node);
- module_memfree(initfree->module_init);
- kfree(initfree);
- }
-}
-
-static int __init modules_wq_init(void)
-{
- INIT_WORK(&init_free_wq, do_free_init);
- init_llist_head(&init_free_list);
- return 0;
-}
-module_init(modules_wq_init);
-
-/*
- * This is where the real work happens.
- *
- * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
- * helper command 'lx-symbols'.
- */
-static noinline int do_init_module(struct module *mod)
-{
- int ret = 0;
- struct mod_initfree *freeinit;
-
- freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
- if (!freeinit) {
- ret = -ENOMEM;
- goto fail;
- }
- freeinit->module_init = mod->init_layout.base;
-
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
-
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- goto fail_free_freeinit;
- }
- if (ret > 0) {
- pr_warn("%s: '%s'->init suspiciously returned %d, it should "
- "follow 0/-E convention\n"
- "%s: loading module anyway...\n",
- __func__, mod->name, ret, __func__);
- dump_stack();
- }
-
- /* Now it's a first class citizen! */
- mod->state = MODULE_STATE_LIVE;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
-
- /*
- * We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
- *
- * http://thread.gmane.org/gmane.linux.kernel/1420814
- */
- if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
- async_synchronize_full();
-
- ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
- mod->init_layout.size);
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- /* Switch to core kallsyms now init is done: kallsyms may be walking! */
- rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
-#endif
- module_enable_ro(mod, true);
- mod_tree_remove_init(mod);
- module_arch_freeing_init(mod);
- mod->init_layout.base = NULL;
- mod->init_layout.size = 0;
- mod->init_layout.ro_size = 0;
- mod->init_layout.ro_after_init_size = 0;
- mod->init_layout.text_size = 0;
- /*
- * We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths, we
- * call synchronize_rcu(), but we don't want to slow down the success
- * path. module_memfree() cannot be called in an interrupt, so do the
- * work and call synchronize_rcu() in a work queue.
- *
- * Note that module_alloc() on most architectures creates W+X page
- * mappings which won't be cleaned up until do_free_init() runs. Any
- * code such as mark_rodata_ro() which depends on those mappings to
- * be cleaned up needs to sync with the queued work - ie
- * rcu_barrier()
- */
- if (llist_add(&freeinit->node, &init_free_list))
- schedule_work(&init_free_wq);
-
- mutex_unlock(&module_mutex);
- wake_up_all(&module_wq);
-
- return 0;
-
-fail_free_freeinit:
- kfree(freeinit);
-fail:
- /* Try to protect us from buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_rcu();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- ftrace_release_mod(mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
-}
-
-static int may_init_module(void)
-{
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- return 0;
-}
-
-/*
- * We try to place it in the list now to make sure it's unique before
- * we dedicate too many resources. In particular, temporary percpu
- * memory exhaustion.
- */
-static int add_unformed_module(struct module *mod)
-{
- int err;
- struct module *old;
-
- mod->state = MODULE_STATE_UNFORMED;
-
-again:
- mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name), true);
- if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto out_unlocked;
- goto again;
- }
- err = -EEXIST;
- goto out;
- }
- mod_update_bounds(mod);
- list_add_rcu(&mod->list, &modules);
- mod_tree_insert(mod);
- err = 0;
-
-out:
- mutex_unlock(&module_mutex);
-out_unlocked:
- return err;
-}
-
-static int complete_formation(struct module *mod, struct load_info *info)
-{
- int err;
-
- mutex_lock(&module_mutex);
-
- /* Find duplicate symbols (must be called under lock). */
- err = verify_exported_symbols(mod);
- if (err < 0)
- goto out;
-
- /* This relies on module_mutex for list integrity. */
- module_bug_finalize(info->hdr, info->sechdrs, mod);
-
- module_enable_ro(mod, false);
- module_enable_nx(mod);
- module_enable_x(mod);
-
- /* Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us. */
- mod->state = MODULE_STATE_COMING;
- mutex_unlock(&module_mutex);
-
- return 0;
-
-out:
- mutex_unlock(&module_mutex);
- return err;
-}
-
-static int prepare_coming_module(struct module *mod)
-{
- int err;
-
- ftrace_module_enable(mod);
- err = klp_module_coming(mod);
- if (err)
- return err;
-
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
- return 0;
-}
-
-static int unknown_module_param_cb(char *param, char *val, const char *modname,
- void *arg)
-{
- struct module *mod = arg;
- int ret;
-
- if (strcmp(param, "async_probe") == 0) {
- mod->async_probe_requested = true;
- return 0;
- }
-
- /* Check for magic 'dyndbg' arg */
- ret = ddebug_dyndbg_module_param_cb(param, val, modname);
- if (ret != 0)
- pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
- return 0;
-}
-
-/* Allocate and load the module: note that size of section 0 is always
- zero, and we rely on this for optional sections. */
-static int load_module(struct load_info *info, const char __user *uargs,
- int flags)
-{
- struct module *mod;
- long err = 0;
- char *after_dashes;
-
- err = elf_header_check(info);
- if (err)
- goto free_copy;
-
- err = setup_load_info(info, flags);
- if (err)
- goto free_copy;
-
- if (blacklisted(info->name)) {
- err = -EPERM;
- goto free_copy;
- }
-
- err = module_sig_check(info, flags);
- if (err)
- goto free_copy;
-
- err = rewrite_section_headers(info, flags);
- if (err)
- goto free_copy;
-
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info, info->mod)) {
- err = -ENOEXEC;
- goto free_copy;
- }
-
- /* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(info, flags);
- if (IS_ERR(mod)) {
- err = PTR_ERR(mod);
- goto free_copy;
- }
-
- audit_log_kern_module(mod->name);
-
- /* Reserve our place in the list. */
- err = add_unformed_module(mod);
- if (err)
- goto free_module;
-
-#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
- add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
- }
-#endif
-
- /* To avoid stressing percpu allocator, do this once we're unique. */
- err = percpu_modalloc(mod, info);
- if (err)
- goto unlink_mod;
-
- /* Now module is in final location, initialize linked lists, etc. */
- err = module_unload_init(mod);
- if (err)
- goto unlink_mod;
-
- init_param_lock(mod);
-
- /* Now we've got everything in the final locations, we can
- * find optional sections. */
- err = find_module_sections(mod, info);
- if (err)
- goto free_unload;
-
- err = check_module_license_and_versions(mod);
- if (err)
- goto free_unload;
-
- /* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, info);
-
- /* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = apply_relocations(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = post_relocation(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- flush_module_icache(mod);
-
- /* Now copy in args */
- mod->args = strndup_user(uargs, ~0UL >> 1);
- if (IS_ERR(mod->args)) {
- err = PTR_ERR(mod->args);
- goto free_arch_cleanup;
- }
-
- dynamic_debug_setup(mod, info->debug, info->num_debug);
-
- /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
- ftrace_module_init(mod);
-
- /* Finally it's fully formed, ready to start executing. */
- err = complete_formation(mod, info);
- if (err)
- goto ddebug_cleanup;
-
- err = prepare_coming_module(mod);
- if (err)
- goto bug_cleanup;
-
- /* Module is ready to execute: parsing args may do that. */
- after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, mod,
- unknown_module_param_cb);
- if (IS_ERR(after_dashes)) {
- err = PTR_ERR(after_dashes);
- goto coming_cleanup;
- } else if (after_dashes) {
- pr_warn("%s: parameters '%s' after `--' ignored\n",
- mod->name, after_dashes);
- }
-
- /* Link in to sysfs. */
- err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
- if (err < 0)
- goto coming_cleanup;
-
- if (is_livepatch_module(mod)) {
- err = copy_module_elf(mod, info);
- if (err < 0)
- goto sysfs_cleanup;
- }
-
- /* Get rid of temporary copy. */
- free_copy(info);
-
- /* Done! */
- trace_module_load(mod);
-
- return do_init_module(mod);
-
- sysfs_cleanup:
- mod_sysfs_teardown(mod);
- coming_cleanup:
- mod->state = MODULE_STATE_GOING;
- destroy_params(mod->kp, mod->num_kp);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- klp_module_going(mod);
- bug_cleanup:
- /* module_bug_cleanup needs module_mutex protection */
- mutex_lock(&module_mutex);
- module_bug_cleanup(mod);
- mutex_unlock(&module_mutex);
-
- ddebug_cleanup:
- ftrace_release_mod(mod);
- dynamic_debug_remove(mod, info->debug);
- synchronize_rcu();
- kfree(mod->args);
- free_arch_cleanup:
- module_arch_cleanup(mod);
- free_modinfo:
- free_modinfo(mod);
- free_unload:
- module_unload_free(mod);
- unlink_mod:
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- mod_tree_remove(mod);
- wake_up_all(&module_wq);
- /* Wait for RCU-sched synchronizing before releasing mod->list. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
- free_module:
- /* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
- module_deallocate(mod, info);
- free_copy:
- free_copy(info);
- return err;
-}
-
-SYSCALL_DEFINE3(init_module, void __user *, umod,
- unsigned long, len, const char __user *, uargs)
-{
- int err;
- struct load_info info = { };
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
-
- err = copy_module_from_user(umod, len, &info);
- if (err)
- return err;
-
- return load_module(&info, uargs, 0);
-}
-
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
-{
- struct load_info info = { };
- loff_t size;
- void *hdr;
- int err;
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
-
- if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC))
- return -EINVAL;
-
- err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
- READING_MODULE);
- if (err)
- return err;
- info.hdr = hdr;
- info.len = size;
-
- return load_module(&info, uargs, flags);
-}
-
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
- return ((void *)addr >= start && (void *)addr < start + size);
-}
-
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
- return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
-{
- unsigned int i, best = 0;
- unsigned long nextval, bestval;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- /* At worse, next value is at end of module */
- if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
- else
- nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
- bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
- /* Scan for closest preceding symbol, and next symbol. (ELF
- starts real symbols at 1). */
- for (i = 1; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
- unsigned long thisval = kallsyms_symbol_value(sym);
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- /* We ignore unnamed symbols: they're uninformative
- * and inserted at a whim. */
- if (*kallsyms_symbol_name(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
- continue;
-
- if (thisval <= addr && thisval > bestval) {
- best = i;
- bestval = thisval;
- }
- if (thisval > addr && thisval < nextval)
- nextval = thisval;
- }
-
- if (!best)
- return NULL;
-
- if (size)
- *size = nextval - bestval;
- if (offset)
- *offset = addr - bestval;
-
- return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
- void *ptr)
-{
- return ptr;
-}
-
-/* For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption. */
-const char *module_address_lookup(unsigned long addr,
- unsigned long *size,
- unsigned long *offset,
- char **modname,
- char *namebuf)
-{
- const char *ret = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (mod) {
- if (modname)
- *modname = mod->name;
-
- ret = find_kallsyms_symbol(mod, addr, size, offset);
- }
- /* Make a copy in here where it's safe */
- if (ret) {
- strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
- ret = namebuf;
- }
- preempt_enable();
-
- return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
- if (!sym)
- goto out;
-
- strlcpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strlcpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strlcpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name, char *module_name, int *exported)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- struct mod_kallsyms *kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- kallsyms = rcu_dereference_sched(mod->kallsyms);
- if (symnum < kallsyms->num_symtab) {
- const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
- *value = kallsyms_symbol_value(sym);
- *type = kallsyms->typetab[symnum];
- strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
- strlcpy(module_name, mod->name, MODULE_NAME_LEN);
- *exported = is_exported(name, *value, mod);
- preempt_enable();
- return 0;
- }
- symnum -= kallsyms->num_symtab;
- }
- preempt_enable();
- return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
- unsigned int i;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
- sym->st_shndx != SHN_UNDEF)
- return kallsyms_symbol_value(sym);
- }
- return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
- struct module *mod;
- char *colon;
- unsigned long ret = 0;
-
- /* Don't lock: we're in enough trouble already. */
- preempt_disable();
- if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
- if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = find_kallsyms_symbol_value(mod, colon+1);
- } else {
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
- break;
- }
- }
- preempt_enable();
- return ret;
-}
-
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
- struct module *, unsigned long),
- void *data)
-{
- struct module *mod;
- unsigned int i;
- int ret;
-
- module_assert_mutex();
-
- list_for_each_entry(mod, &modules, list) {
- /* We hold module_mutex: no need for rcu_dereference_sched */
- struct mod_kallsyms *kallsyms = mod->kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- ret = fn(data, kallsyms_symbol_name(kallsyms, i),
- mod, kallsyms_symbol_value(sym));
- if (ret != 0)
- return ret;
- }
- }
- return 0;
-}
-#endif /* CONFIG_KALLSYMS */
-
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
-/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
-{
- int bx = 0;
-
- BUG_ON(mod->state == MODULE_STATE_UNFORMED);
- if (mod->taints ||
- mod->state == MODULE_STATE_GOING ||
- mod->state == MODULE_STATE_COMING) {
- buf[bx++] = '(';
- bx += module_flags_taint(mod, buf + bx);
- /* Show a - for module-is-being-unloaded */
- if (mod->state == MODULE_STATE_GOING)
- buf[bx++] = '-';
- /* Show a + for module-is-being-loaded */
- if (mod->state == MODULE_STATE_COMING)
- buf[bx++] = '+';
- buf[bx++] = ')';
- }
- buf[bx] = '\0';
-
- return buf;
-}
-
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
- struct module *mod = list_entry(p, struct module, list);
- char buf[MODULE_FLAGS_BUF_SIZE];
- void *value;
-
- /* We always ignore unformed modules. */
- if (mod->state == MODULE_STATE_UNFORMED)
- return 0;
-
- seq_printf(m, "%s %u",
- mod->name, mod->init_layout.size + mod->core_layout.size);
- print_unload_info(m, mod);
-
- /* Informative for users. */
- seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading" :
- mod->state == MODULE_STATE_COMING ? "Loading" :
- "Live");
- /* Used by oprofile and other similar tools. */
- value = m->private ? NULL : mod->core_layout.base;
- seq_printf(m, " 0x%px", value);
-
- /* Taints info */
- if (mod->taints)
- seq_printf(m, " %s", module_flags(mod, buf));
-
- seq_puts(m, "\n");
- return 0;
-}
-
-/* Format: modulename size refcount deps address
-
- Where refcount is a number or -, and deps is a comma-separated list
- of depends or -.
-*/
-static const struct seq_operations modules_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
- int err = seq_open(file, &modules_op);
-
- if (!err) {
- struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
- }
-
- return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
- .proc_flags = PROC_ENTRY_PERMANENT,
- .proc_open = modules_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
- proc_create("modules", 0, NULL, &modules_proc_ops);
- return 0;
-}
-module_init(proc_modules_init);
-#endif
-
-/* Given an address, look for it in the module exception tables. */
-const struct exception_table_entry *search_module_extables(unsigned long addr)
-{
- const struct exception_table_entry *e = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (!mod)
- goto out;
-
- if (!mod->num_exentries)
- goto out;
-
- e = search_extable(mod->extable,
- mod->num_exentries,
- addr);
-out:
- preempt_enable();
-
- /*
- * Now, if we found one, we are running inside it now, hence
- * we cannot unload the module, hence no refcnt needed.
- */
- return e;
-}
-
-/*
- * is_module_address - is this address inside a module?
- * @addr: the address to check.
- *
- * See is_module_text_address() if you simply want to see if the address
- * is code (not data).
- */
-bool is_module_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/*
- * __module_address - get the module which contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_address(unsigned long addr)
-{
- struct module *mod;
-
- if (addr < module_addr_min || addr > module_addr_max)
- return NULL;
-
- module_assert_mutex_or_preempt();
-
- mod = mod_find(addr);
- if (mod) {
- BUG_ON(!within_module(addr, mod));
- if (mod->state == MODULE_STATE_UNFORMED)
- mod = NULL;
- }
- return mod;
-}
-EXPORT_SYMBOL_GPL(__module_address);
-
-/*
- * is_module_text_address - is this address inside module code?
- * @addr: the address to check.
- *
- * See is_module_address() if you simply want to see if the address is
- * anywhere in a module. See kernel_text_address() for testing if an
- * address corresponds to kernel or module code.
- */
-bool is_module_text_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_text_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/*
- * __module_text_address - get the module whose code contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_text_address(unsigned long addr)
-{
- struct module *mod = __module_address(addr);
- if (mod) {
- /* Make sure it's within the text section. */
- if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
- && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
- mod = NULL;
- }
- return mod;
-}
-EXPORT_SYMBOL_GPL(__module_text_address);
-
-/* Don't grab lock, we're oopsing. */
-void print_modules(void)
-{
- struct module *mod;
- char buf[MODULE_FLAGS_BUF_SIZE];
-
- printk(KERN_DEFAULT "Modules linked in:");
- /* Most callers should already have preempt disabled, but make sure */
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- pr_cont(" %s%s", mod->name, module_flags(mod, buf));
- }
- preempt_enable();
- if (last_unloaded_module[0])
- pr_cont(" [last unloaded: %s]", last_unloaded_module);
- pr_cont("\n");
-}
-
-#ifdef CONFIG_MODVERSIONS
-/* Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module. */
-void module_layout(struct module *mod,
- struct modversion_info *ver,
- struct kernel_param *kp,
- struct kernel_symbol *ks,
- struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
new file mode 100644
index 000000000000..0ea1b2970a23
--- /dev/null
+++ b/kernel/module/Kconfig
@@ -0,0 +1,397 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig MODULES
+ bool "Enable loadable module support"
+ modules
+ help
+ Kernel modules are small pieces of compiled code which can
+ be inserted in the running kernel, rather than being
+ permanently built into the kernel. You use the "modprobe"
+ tool to add (and sometimes remove) them. If you say Y here,
+ many parts of the kernel can be built as modules (by
+ answering M instead of Y where indicated): this is most
+ useful for infrequently used options which are not required
+ for booting. For more information, see the man pages for
+ modprobe, lsmod, modinfo, insmod and rmmod.
+
+ If you say Y here, you will need to run "make
+ modules_install" to put the modules under /lib/modules/
+ where modprobe can find them (you may need to be root to do
+ this).
+
+ If unsure, say Y.
+
+if MODULES
+
+config MODULE_DEBUGFS
+ bool
+
+config MODULE_DEBUG
+ bool "Module debugging"
+ depends on DEBUG_FS
+ help
+ Allows you to enable / disable features which can help you debug
+ modules. You don't need these options on production systems.
+
+if MODULE_DEBUG
+
+config MODULE_STATS
+ bool "Module statistics"
+ depends on DEBUG_FS
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of module statistics.
+ For example, size of all modules, average size, text size, a list
+ of failed modules and the size for each of those. For failed
+ modules we keep track of modules which failed due to either the
+ existing module taking too long to load or that module was already
+ loaded.
+
+ You should enable this if you are debugging production loads
+ and want to see if userspace or the kernel is doing stupid things
+ with loading modules when it shouldn't or if you want to help
+ optimize userspace / kernel space module autoloading schemes.
+ You might want to do this because failed modules tend to use
+ up significant amount of memory, and so you'd be doing everyone a
+ favor in avoiding these failures proactively.
+
+ This functionality is also useful for those experimenting with
+ module .text ELF section optimization.
+
+ If unsure, say N.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS
+ bool "Debug duplicate modules with auto-loading"
+ help
+ Module autoloading allows in-kernel code to request modules through
+ the *request_module*() API calls. This in turn just calls userspace
+ modprobe. Although modprobe checks to see if a module is already
+ loaded before trying to load a module there is a small time window in
+ which multiple duplicate requests can end up in userspace and multiple
+ modprobe calls race calling finit_module() around the same time for
+ duplicate modules. The finit_module() system call can consume in the
+ worst case more than twice the respective module size in virtual
+ memory for each duplicate module requests. Although duplicate module
+ requests are non-fatal virtual memory is a limited resource and each
+ duplicate module request ends up just unnecessarily straining virtual
+ memory.
+
+ This debugging facility will create pr_warn() splats for duplicate
+ module requests to help identify if module auto-loading may be the
+ culprit to your early boot virtual memory pressure. Since virtual
+ memory abuse caused by duplicate module requests could render a
+ system unusable this functionality will also converge races in
+ requests for the same module to a single request. You can boot with
+ the module.enable_dups_trace=1 kernel parameter to use WARN_ON()
+ instead of the pr_warn().
+
+ If the first module request used request_module_nowait() we cannot
+ use that as the anchor to wait for duplicate module requests, since
+ users of request_module() do want a proper return value. If a call
+ for the same module happened earlier with request_module() though,
+ then a duplicate request_module_nowait() would be detected. The
+ non-wait request_module() call is synchronous and waits until modprobe
+ completes. Subsequent auto-loading requests for the same module do
+ not trigger a new finit_module() calls and do not strain virtual
+ memory, and so as soon as modprobe successfully completes we remove
+ tracking for duplicates for that module.
+
+ Enable this functionality to try to debug virtual memory abuse during
+ boot on systems which are failing to boot or if you suspect you may be
+ straining virtual memory during boot, and you want to identify if the
+ abuse was due to module auto-loading. These issues are currently only
+ known to occur on systems with many CPUs (over 400) and is likely the
+ result of udev issuing duplicate module requests for each CPU, and so
+ module auto-loading is not the culprit. There may very well still be
+ many duplicate module auto-loading requests which could be optimized
+ for and this debugging facility can be used to help identify them.
+
+ Only enable this for debugging system functionality, never have it
+ enabled on real systems.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE
+ bool "Force full stack trace when duplicates are found"
+ depends on MODULE_DEBUG_AUTOLOAD_DUPS
+ help
+ Enabling this will force a full stack trace for duplicate module
+ auto-loading requests using WARN_ON() instead of pr_warn(). You
+ should keep this disabled at all times unless you are a developer
+ and are doing a manual inspection and want to debug exactly why
+ these duplicates occur.
+
+endif # MODULE_DEBUG
+
+config MODULE_FORCE_LOAD
+ bool "Forced module loading"
+ default n
+ help
+ Allow loading of modules without version information (ie. modprobe
+ --force). Forced module loading sets the 'F' (forced) taint flag and
+ is usually a really bad idea.
+
+config MODULE_UNLOAD
+ bool "Module unloading"
+ help
+ Without this option you will not be able to unload any
+ modules (note that some modules may not be unloadable
+ anyway), which makes your kernel smaller, faster
+ and simpler. If unsure, say Y.
+
+config MODULE_FORCE_UNLOAD
+ bool "Forced module unloading"
+ depends on MODULE_UNLOAD
+ help
+ This option allows you to force a module to unload, even if the
+ kernel believes it is unsafe: the kernel will remove the module
+ without waiting for anyone to stop using it (using the -f option to
+ rmmod). This is mainly for kernel developers and desperate users.
+ If unsure, say N.
+
+config MODULE_UNLOAD_TAINT_TRACKING
+ bool "Tainted module unload tracking"
+ depends on MODULE_UNLOAD
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of each unloaded
+ module that tainted the kernel. In addition to displaying a
+ list of linked (or loaded) modules e.g. on detection of a bad
+ page (see bad_page()), the aforementioned details are also
+ shown. If unsure, say N.
+
+config MODVERSIONS
+ bool "Module versioning support"
+ help
+ Usually, you have to use modules compiled with your kernel.
+ Saying Y here makes it sometimes possible to use modules
+ compiled for different kernels, by adding enough information
+ to the modules to (hopefully) spot any changes which would
+ make them incompatible with the kernel you are running. If
+ unsure, say N.
+
+config ASM_MODVERSIONS
+ bool
+ default HAVE_ASM_MODVERSIONS && MODVERSIONS
+ help
+ This enables module versioning for exported symbols also from
+ assembly. This can be enabled only when the target architecture
+ supports it.
+
+config MODULE_SRCVERSION_ALL
+ bool "Source checksum for all modules"
+ help
+ Modules which contain a MODULE_VERSION get an extra "srcversion"
+ field inserted into their modinfo section, which contains a
+ sum of the source files which made it. This helps maintainers
+ see exactly which source was used to build a module (since
+ others sometimes change the module source without updating
+ the version). With this option, such a "srcversion" field
+ will be created for all modules. If unsure, say N.
+
+config MODULE_SIG
+ bool "Module signature verification"
+ select MODULE_SIG_FORMAT
+ help
+ Check modules for valid signatures upon load: the signature
+ is simply appended to the module. For more information see
+ <file:Documentation/admin-guide/module-signing.rst>.
+
+ Note that this option adds the OpenSSL development packages as a
+ kernel build dependency so that the signing tool can use its crypto
+ library.
+
+ You should enable this option if you wish to use either
+ CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via
+ another LSM - otherwise unsigned modules will be loadable regardless
+ of the lockdown policy.
+
+ !!!WARNING!!! If you enable this option, you MUST make sure that the
+ module DOES NOT get stripped after being signed. This includes the
+ debuginfo strip done by some packagers (such as rpmbuild) and
+ inclusion into an initramfs that wants the module size reduced.
+
+config MODULE_SIG_FORCE
+ bool "Require modules to be validly signed"
+ depends on MODULE_SIG
+ help
+ Reject unsigned modules or signed modules for which we don't have a
+ key. Without this, such modules will simply taint the kernel.
+
+config MODULE_SIG_ALL
+ bool "Automatically sign all modules"
+ default y
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ Sign all modules during make modules_install. Without this option,
+ modules must be signed manually, using the scripts/sign-file tool.
+
+comment "Do not forget to sign required modules with scripts/sign-file"
+ depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
+
+choice
+ prompt "Which hash algorithm should modules be signed with?"
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ This determines which sort of hashing algorithm will be used during
+ signature generation. This algorithm _must_ be built into the kernel
+ directly so that signature verification can take place. It is not
+ possible to load a signed module containing the algorithm to check
+ the signature on that module.
+
+config MODULE_SIG_SHA256
+ bool "Sign modules with SHA-256"
+ select CRYPTO_SHA256
+
+config MODULE_SIG_SHA384
+ bool "Sign modules with SHA-384"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA512
+ bool "Sign modules with SHA-512"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA3_256
+ bool "Sign modules with SHA3-256"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+ bool "Sign modules with SHA3-384"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+ bool "Sign modules with SHA3-512"
+ select CRYPTO_SHA3
+
+endchoice
+
+config MODULE_SIG_HASH
+ string
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha256" if MODULE_SIG_SHA256
+ default "sha384" if MODULE_SIG_SHA384
+ default "sha512" if MODULE_SIG_SHA512
+ default "sha3-256" if MODULE_SIG_SHA3_256
+ default "sha3-384" if MODULE_SIG_SHA3_384
+ default "sha3-512" if MODULE_SIG_SHA3_512
+
+choice
+ prompt "Module compression mode"
+ help
+ This option allows you to choose the algorithm which will be used to
+ compress modules when 'make modules_install' is run. (or, you can
+ choose to not compress modules at all.)
+
+ External modules will also be compressed in the same way during the
+ installation.
+
+ For modules inside an initrd or initramfs, it's more efficient to
+ compress the whole initrd or initramfs instead.
+
+ This is fully compatible with signed modules.
+
+ Please note that the tool used to load modules needs to support the
+ corresponding algorithm. module-init-tools MAY support gzip, and kmod
+ MAY support gzip, xz and zstd.
+
+ Your build system needs to provide the appropriate compression tool
+ to compress the modules.
+
+ If in doubt, select 'None'.
+
+config MODULE_COMPRESS_NONE
+ bool "None"
+ help
+ Do not compress modules. The installed modules are suffixed
+ with .ko.
+
+config MODULE_COMPRESS_GZIP
+ bool "GZIP"
+ help
+ Compress modules with GZIP. The installed modules are suffixed
+ with .ko.gz.
+
+config MODULE_COMPRESS_XZ
+ bool "XZ"
+ help
+ Compress modules with XZ. The installed modules are suffixed
+ with .ko.xz.
+
+config MODULE_COMPRESS_ZSTD
+ bool "ZSTD"
+ help
+ Compress modules with ZSTD. The installed modules are suffixed
+ with .ko.zst.
+
+endchoice
+
+config MODULE_DECOMPRESS
+ bool "Support in-kernel module decompression"
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
+ select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
+ select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
+ help
+
+ Support for decompressing kernel modules by the kernel itself
+ instead of relying on userspace to perform this task. Useful when
+ load pinning security policy is enabled.
+
+ If unsure, say N.
+
+config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ bool "Allow loading of modules with missing namespace imports"
+ help
+ Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in
+ a namespace. A module that makes use of a symbol exported with such a
+ namespace is required to import the namespace via MODULE_IMPORT_NS().
+ There is no technical reason to enforce correct namespace imports,
+ but it creates consistency between symbols defining namespaces and
+ users importing namespaces they make use of. This option relaxes this
+ requirement and lifts the enforcement when loading a module.
+
+ If unsure, say N.
+
+config MODPROBE_PATH
+ string "Path to modprobe binary"
+ default "/sbin/modprobe"
+ help
+ When kernel code requests a module, it does so by calling
+ the "modprobe" userspace utility. This option allows you to
+ set the path where that binary is found. This can be changed
+ at runtime via the sysctl file
+ /proc/sys/kernel/modprobe. Setting this to the empty string
+ removes the kernel's ability to request modules (but
+ userspace can still load modules explicitly).
+
+config TRIM_UNUSED_KSYMS
+ bool "Trim unused exported kernel symbols" if EXPERT
+ depends on !COMPILE_TEST
+ help
+ The kernel and some modules make many symbols available for
+ other modules to use via EXPORT_SYMBOL() and variants. Depending
+ on the set of modules being selected in your kernel configuration,
+ many of those exported symbols might never be used.
+
+ This option allows for unused exported symbols to be dropped from
+ the build. In turn, this provides the compiler more opportunities
+ (especially when using LTO) for optimizing the code and reducing
+ binary size. This might have some security advantages as well.
+
+ If unsure, or if you need to build out-of-tree modules, say N.
+
+config UNUSED_KSYMS_WHITELIST
+ string "Whitelist of symbols to keep in ksymtab"
+ depends on TRIM_UNUSED_KSYMS
+ help
+ By default, all unused exported symbols will be un-exported from the
+ build when TRIM_UNUSED_KSYMS is selected.
+
+ UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept
+ exported at all times, even in absence of in-tree users. The value to
+ set here is the path to a text file containing the list of symbols,
+ one per line. The path can be absolute, or relative to the kernel
+ source tree.
+
+config MODULES_TREE_LOOKUP
+ def_bool y
+ depends on PERF_EVENTS || TRACING || CFI_CLANG
+
+endif # MODULES
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..a10b2b9a6fdf
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+
+obj-y += main.o
+obj-y += strict_rwx.o
+obj-y += kmod.o
+obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..12a569d361e8
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ /* Scan all writable sections that's not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
new file mode 100644
index 000000000000..474e68f0f063
--- /dev/null
+++ b/kernel/module/decompress.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+
+static int module_extend_max_pages(struct load_info *info, unsigned int extent)
+{
+ struct page **new_pages;
+
+ new_pages = kvmalloc_array(info->max_pages + extent,
+ sizeof(info->pages), GFP_KERNEL);
+ if (!new_pages)
+ return -ENOMEM;
+
+ memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
+ kvfree(info->pages);
+ info->pages = new_pages;
+ info->max_pages += extent;
+
+ return 0;
+}
+
+static struct page *module_get_next_page(struct load_info *info)
+{
+ struct page *page;
+ int error;
+
+ if (info->max_pages == info->used_pages) {
+ error = module_extend_max_pages(info, info->used_pages);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ info->pages[info->used_pages++] = page;
+ return page;
+}
+
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
+#include <linux/zlib.h>
+#define MODULE_COMPRESSION gzip
+#define MODULE_DECOMPRESS_FN module_gzip_decompress
+
+/*
+ * Calculate length of the header which consists of signature, header
+ * flags, time stamp and operating system ID (10 bytes total), plus
+ * an optional filename.
+ */
+static size_t module_gzip_header_len(const u8 *buf, size_t size)
+{
+ const u8 signature[] = { 0x1f, 0x8b, 0x08 };
+ size_t len = 10;
+
+ if (size < len || memcmp(buf, signature, sizeof(signature)))
+ return 0;
+
+ if (buf[3] & 0x08) {
+ do {
+ /*
+ * If we can't find the end of the file name we must
+ * be dealing with a corrupted file.
+ */
+ if (len == size)
+ return 0;
+ } while (buf[len++] != '\0');
+ }
+
+ return len;
+}
+
+static ssize_t module_gzip_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ struct z_stream_s s = { 0 };
+ size_t new_size = 0;
+ size_t gzip_hdr_len;
+ ssize_t retval;
+ int rc;
+
+ gzip_hdr_len = module_gzip_header_len(buf, size);
+ if (!gzip_hdr_len) {
+ pr_err("not a gzip compressed module\n");
+ return -EINVAL;
+ }
+
+ s.next_in = buf + gzip_hdr_len;
+ s.avail_in = size - gzip_hdr_len;
+
+ s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ if (!s.workspace)
+ return -ENOMEM;
+
+ rc = zlib_inflateInit2(&s, -MAX_WBITS);
+ if (rc != Z_OK) {
+ pr_err("failed to initialize decompressor: %d\n", rc);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out_inflate_end;
+ }
+
+ s.next_out = kmap_local_page(page);
+ s.avail_out = PAGE_SIZE;
+ rc = zlib_inflate(&s, 0);
+ kunmap_local(s.next_out);
+
+ new_size += PAGE_SIZE - s.avail_out;
+ } while (rc == Z_OK);
+
+ if (rc != Z_STREAM_END) {
+ pr_err("decompression failed with status %d\n", rc);
+ retval = -EINVAL;
+ goto out_inflate_end;
+ }
+
+ retval = new_size;
+
+out_inflate_end:
+ zlib_inflateEnd(&s);
+out:
+ kvfree(s.workspace);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
+#include <linux/xz.h>
+#define MODULE_COMPRESSION xz
+#define MODULE_DECOMPRESS_FN module_xz_decompress
+
+static ssize_t module_xz_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 };
+ struct xz_dec *xz_dec;
+ struct xz_buf xz_buf;
+ enum xz_ret xz_ret;
+ size_t new_size = 0;
+ ssize_t retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not an xz compressed module\n");
+ return -EINVAL;
+ }
+
+ xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1);
+ if (!xz_dec)
+ return -ENOMEM;
+
+ xz_buf.in_size = size;
+ xz_buf.in = buf;
+ xz_buf.in_pos = 0;
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ xz_buf.out = kmap_local_page(page);
+ xz_buf.out_pos = 0;
+ xz_buf.out_size = PAGE_SIZE;
+ xz_ret = xz_dec_run(xz_dec, &xz_buf);
+ kunmap_local(xz_buf.out);
+
+ new_size += xz_buf.out_pos;
+ } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
+
+ if (xz_ret != XZ_STREAM_END) {
+ pr_err("decompression failed with status %d\n", xz_ret);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ xz_dec_end(xz_dec);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kvmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap_local(zstd_dec.dst);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kvfree(wksp);
+ return retval;
+}
+#else
+#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
+#endif
+
+int module_decompress(struct load_info *info, const void *buf, size_t size)
+{
+ unsigned int n_pages;
+ ssize_t data_size;
+ int error;
+
+#if defined(CONFIG_MODULE_STATS)
+ info->compressed_len = size;
+#endif
+
+ /*
+ * Start with number of pages twice as big as needed for
+ * compressed data.
+ */
+ n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
+ error = module_extend_max_pages(info, n_pages);
+
+ data_size = MODULE_DECOMPRESS_FN(info, buf, size);
+ if (data_size < 0) {
+ error = data_size;
+ goto err;
+ }
+
+ info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL);
+ if (!info->hdr) {
+ error = -ENOMEM;
+ goto err;
+ }
+
+ info->len = data_size;
+ return 0;
+
+err:
+ module_decompress_cleanup(info);
+ return error;
+}
+
+void module_decompress_cleanup(struct load_info *info)
+{
+ int i;
+
+ if (info->hdr)
+ vunmap(info->hdr);
+
+ for (i = 0; i < info->used_pages; i++)
+ __free_page(info->pages[i]);
+
+ kvfree(info->pages);
+
+ info->pages = NULL;
+ info->max_pages = info->used_pages = 0;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t compression_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
+}
+
+static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
+
+static int __init module_decompress_sysfs_init(void)
+{
+ int error;
+
+ error = sysfs_create_file(&module_kset->kobj,
+ &module_compression_attr.attr);
+ if (error)
+ pr_warn("Failed to create 'compression' attribute");
+
+ return 0;
+}
+late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
new file mode 100644
index 000000000000..9a92f2f8c9d3
--- /dev/null
+++ b/kernel/module/dups.c
@@ -0,0 +1,248 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * kmod dups - the kernel module autoloader duplicate suppressor
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define pr_fmt(fmt) "module: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE);
+module_param(enable_dups_trace, bool_enable_only, 0644);
+
+/*
+ * Protects dup_kmod_reqs list, adds / removals with RCU.
+ */
+static DEFINE_MUTEX(kmod_dup_mutex);
+static LIST_HEAD(dup_kmod_reqs);
+
+struct kmod_dup_req {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ struct completion first_req_done;
+ struct work_struct complete_work;
+ struct delayed_work delete_work;
+ int dup_ret;
+};
+
+static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name)
+{
+ struct kmod_dup_req *kmod_req;
+
+ list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list,
+ lockdep_is_held(&kmod_dup_mutex)) {
+ if (strlen(kmod_req->name) == strlen(module_name) &&
+ !memcmp(kmod_req->name, module_name, strlen(module_name))) {
+ return kmod_req;
+ }
+ }
+
+ return NULL;
+}
+
+static void kmod_dup_request_delete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+ kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work);
+
+ /*
+ * The typical situation is a module successully loaded. In that
+ * situation the module will be present already in userspace. If
+ * new requests come in after that, userspace will already know the
+ * module is loaded so will just return 0 right away. There is still
+ * a small chance right after we delete this entry new request_module()
+ * calls may happen after that, they can happen. These heuristics
+ * are to protect finit_module() abuse for auto-loading, if modules
+ * are still tryign to auto-load even if a module is already loaded,
+ * that's on them, and those inneficiencies should not be fixed by
+ * kmod. The inneficies there are a call to modprobe and modprobe
+ * just returning 0.
+ */
+ mutex_lock(&kmod_dup_mutex);
+ list_del_rcu(&kmod_req->list);
+ synchronize_rcu();
+ mutex_unlock(&kmod_dup_mutex);
+ kfree(kmod_req);
+}
+
+static void kmod_dup_request_complete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+
+ kmod_req = container_of(work, struct kmod_dup_req, complete_work);
+
+ /*
+ * This will ensure that the kernel will let all the waiters get
+ * informed its time to check the return value. It's time to
+ * go home.
+ */
+ complete_all(&kmod_req->first_req_done);
+
+ /*
+ * Now that we have allowed prior request_module() calls to go on
+ * with life, let's schedule deleting this entry. We don't have
+ * to do it right away, but we *eventually* want to do it so to not
+ * let this linger forever as this is just a boot optimization for
+ * possible abuses of vmalloc() incurred by finit_module() thrashing.
+ */
+ queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+}
+
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ struct kmod_dup_req *kmod_req, *new_kmod_req;
+ int ret;
+
+ /*
+ * Pre-allocate the entry in case we have to use it later
+ * to avoid contention with the mutex.
+ */
+ new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL);
+ if (!new_kmod_req)
+ return false;
+
+ memcpy(new_kmod_req->name, module_name, strlen(module_name));
+ INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete);
+ INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete);
+ init_completion(&new_kmod_req->first_req_done);
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req) {
+ /*
+ * If the first request that came through for a module
+ * was with request_module_nowait() we cannot wait for it
+ * and share its return value with other users which may
+ * have used request_module() and need a proper return value
+ * so just skip using them as an anchor.
+ *
+ * If a prior request to this one came through with
+ * request_module() though, then a request_module_nowait()
+ * would benefit from duplicate detection.
+ */
+ if (!wait) {
+ kfree(new_kmod_req);
+ pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+
+ /*
+ * There was no duplicate, just add the request so we can
+ * keep tab on duplicates later.
+ */
+ pr_debug("New request_module() for %s\n", module_name);
+ list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+ mutex_unlock(&kmod_dup_mutex);
+
+ /* We are dealing with a duplicate request now */
+ kfree(new_kmod_req);
+
+ /*
+ * To fix these try to use try_then_request_module() instead as that
+ * will check if the component you are looking for is present or not.
+ * You could also just queue a single request to load the module once,
+ * instead of having each and everything you need try to request for
+ * the module.
+ *
+ * Duplicate request_module() calls can cause quite a bit of wasted
+ * vmalloc() space when racing with userspace.
+ */
+ if (enable_dups_trace)
+ WARN(1, "module-autoload: duplicate request for module %s\n", module_name);
+ else
+ pr_warn("module-autoload: duplicate request for module %s\n", module_name);
+
+ if (!wait) {
+ /*
+ * If request_module_nowait() was used then the user just
+ * wanted to issue the request and if another module request
+ * was already its way with the same name we don't care for
+ * the return value either. Let duplicate request_module_nowait()
+ * calls bail out right away.
+ */
+ *dup_ret = 0;
+ return true;
+ }
+
+ /*
+ * If a duplicate request_module() was used they *may* care for
+ * the return value, so we have no other option but to wait for
+ * the first caller to complete. If the first caller used
+ * the request_module_nowait() call, subsquent callers will
+ * deal with the comprmise of getting a successful call with this
+ * optimization enabled ...
+ */
+ ret = wait_for_completion_state(&kmod_req->first_req_done,
+ TASK_KILLABLE);
+ if (ret) {
+ *dup_ret = ret;
+ return true;
+ }
+
+ /* Now the duplicate request has the same exact return value as the first request */
+ *dup_ret = kmod_req->dup_ret;
+
+ return true;
+}
+
+void kmod_dup_request_announce(char *module_name, int ret)
+{
+ struct kmod_dup_req *kmod_req;
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req)
+ goto out;
+
+ kmod_req->dup_ret = ret;
+
+ /*
+ * If we complete() here we may allow duplicate threads
+ * to continue before the first one that submitted the
+ * request. We're in no rush also, given that each and
+ * every bounce back to userspace is slow we avoid that
+ * with a slight delay here. So queueue up the completion
+ * and let duplicates suffer, just wait a tad bit longer.
+ * There is no rush. But we also don't want to hold the
+ * caller up forever or introduce any boot delays.
+ */
+ queue_work(system_wq, &kmod_req->complete_work);
+
+out:
+ mutex_unlock(&kmod_dup_mutex);
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..c8b7b4dcf782
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,406 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/elf.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/mm.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
+ */
+
+#define SH_ENTSIZE_TYPE_BITS 4
+#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+struct kernel_symbol {
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ int value_offset;
+ int name_offset;
+ int namespace_offset;
+#else
+ unsigned long value;
+ const char *name;
+ const char *namespace;
+#endif
+};
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+extern struct module_attribute *modinfo_attrs[];
+extern size_t modinfo_attrs_count;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+#ifdef CONFIG_MODULE_STATS
+ unsigned long compressed_len;
+#endif
+ struct page **pages;
+ unsigned int max_pages;
+ unsigned int used_pages;
+#endif
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
+enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+};
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const s32 *crc;
+ const struct kernel_symbol *sym;
+ enum mod_license license;
+};
+
+int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section);
+char *module_flags(struct module *mod, char *buf, bool show_state);
+size_t module_flags_taint(unsigned long taints, char *buf);
+
+char *module_next_tag_pair(char *string, unsigned long *secsize);
+
+#define for_each_modinfo_entry(entry, info, name) \
+ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
+
+static inline void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return;
+
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+ !lockdep_is_held(&module_mutex));
+#endif
+}
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+ mod->klp = true;
+ return true;
+#else
+ return false;
+#endif
+}
+
+/**
+ * enum fail_dup_mod_reason - state at which a duplicate module was detected
+ *
+ * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
+ * we've determined that another module with the same name is already loaded
+ * or being processed on our &modules list. This happens on early_mod_check()
+ * right before layout_and_allocate(). The kernel would have already
+ * vmalloc()'d space for the entire module through finit_module(). If
+ * decompression was used two vmap() spaces were used. These failures can
+ * happen when userspace has not seen the module present on the kernel and
+ * tries to load the module multiple times at same time.
+ * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
+ * checks and the kernel determines that the module was unique and because
+ * of this allocated yet another private kernel copy of the module space in
+ * layout_and_allocate() but after this determined in add_unformed_module()
+ * that another module with the same name is already loaded or being processed.
+ * These failures should be mitigated as much as possible and are indicative
+ * of really fast races in loading modules. Without module decompression
+ * they waste twice as much vmap space. With module decompression three
+ * times the module's size vmap space is wasted.
+ */
+enum fail_dup_mod_reason {
+ FAIL_DUP_MOD_BECOMING = 0,
+ FAIL_DUP_MOD_LOAD,
+};
+
+#ifdef CONFIG_MODULE_DEBUGFS
+extern struct dentry *mod_debugfs_root;
+#endif
+
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add_long(count, var) atomic_long_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic_long_t total_mod_size;
+extern atomic_long_t total_text_size;
+extern atomic_long_t invalid_kread_bytes;
+extern atomic_long_t invalid_decompress_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+struct mod_fail_load {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ atomic_long_t count;
+ unsigned long dup_fail_mask;
+};
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
+void mod_stat_bump_invalid(struct load_info *info, int flags);
+void mod_stat_bump_becoming(struct load_info *info, int flags);
+
+#else
+
+#define mod_stat_add_long(name, var)
+#define mod_stat_inc(name)
+
+static inline int try_add_failed_module(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ return 0;
+}
+
+static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+}
+
+static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+}
+
+#endif /* CONFIG_MODULE_STATS */
+
+#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
+void kmod_dup_request_announce(char *module_name, int ret);
+#else
+static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ return false;
+}
+
+static inline void kmod_dup_request_announce(char *module_name, int ret)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ unsigned long taints;
+ u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+ return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
+
+struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ struct latch_tree_root root;
+#endif
+ unsigned long addr_min;
+ unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ unsigned long data_addr_min;
+ unsigned long data_addr_max;
+#endif
+};
+
+extern struct mod_tree_root mod_tree;
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+void module_enable_ro(const struct module *mod, bool after_init);
+void module_enable_nx(const struct module *mod);
+void module_enable_x(const struct module *mod);
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod);
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+ struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+ const char *symname, struct module *mod, const s32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+ struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..ef73ae7c8909
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/module_symbol.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+
+ if (!mod)
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+ return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+ sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum, unsigned int pcpundx)
+{
+ const Elf_Shdr *sec;
+ enum mod_mem_type type;
+
+ if (src->st_shndx == SHN_UNDEF ||
+ src->st_shndx >= shnum ||
+ !src->st_name)
+ return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
+ sec = sechdrs + src->st_shndx;
+ type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || mod_mem_type_is_init(type))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+ struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+ struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ symsect, info->index.sym);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod_mem_data->size += strtab_size;
+ /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
+ info->core_typeoffs = mod_mem_data->size;
+ mod_mem_data->size += ndst * sizeof(char);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ strsect, info->index.str);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+ mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod_mem_init_data->size;
+ mod_mem_init_data->size += nsrc * sizeof(char);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ unsigned long strtab_size;
+ void *data_base = mod->mem[MOD_DATA].base;
+ void *init_data_base = mod->mem[MOD_INIT_DATA].base;
+
+ /* Set up to point into init section. */
+ mod->kallsyms = (void __rcu *)init_data_base +
+ info->mod_kallsyms_init_off;
+
+ rcu_read_lock();
+ /* The following is safe since this pointer cannot change */
+ rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
+ rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ rcu_dereference(mod->kallsyms)->strtab =
+ (void *)info->sechdrs[info->index.str].sh_addr;
+ rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs;
+
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
+ mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+ mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+ mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
+ strtab_size = info->core_typeoffs - info->stroffs;
+ src = rcu_dereference(mod->kallsyms)->symtab;
+ for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
+ rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ ssize_t ret;
+
+ mod->core_kallsyms.typetab[ndst] =
+ rcu_dereference(mod->kallsyms)->typetab[i];
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ ret = strscpy(s,
+ &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
+ strtab_size);
+ if (ret < 0)
+ break;
+ s += ret + 1;
+ strtab_size -= ret + 1;
+ }
+ }
+ rcu_read_unlock();
+ mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval, bestval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ struct module_memory *mod_mem;
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ mod_mem = &mod->mem[MOD_INIT_TEXT];
+ else
+ mod_mem = &mod->mem[MOD_TEXT];
+
+ nextval = (unsigned long)mod_mem->base + mod_mem->size;
+
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+ is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ continue;
+
+ if (thisval <= addr && thisval > bestval) {
+ best = i;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - bestval;
+ if (offset)
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ const unsigned char **modbuildid,
+ char *namebuf)
+{
+ const char *ret = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
+
+ ret = find_kallsyms_symbol(mod, addr, size, offset);
+ }
+ /* Make a copy in here where it's safe */
+ if (ret) {
+ strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+ ret = namebuf;
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+
+ strscpy(symname, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = kallsyms->typetab[symnum];
+ strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ strscpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ preempt_enable();
+ return 0;
+ }
+ symnum -= kallsyms->num_symtab;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
+ return 0;
+}
+
+static unsigned long __module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+
+ colon = strnchr(name, MODULE_NAME_LEN, ':');
+ if (colon) {
+ mod = find_module_all(name, colon - name, false);
+ if (mod)
+ return __find_kallsyms_symbol_value(mod, colon + 1);
+ return 0;
+ }
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ unsigned long ret;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ ret = __find_kallsyms_symbol_value(mod, name);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ unsigned long ret;
+
+ /* Don't lock: we're in enough trouble already. */
+ preempt_disable();
+ ret = __module_kallsyms_lookup_name(name);
+ preempt_enable();
+ return ret;
+}
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned long ret;
+
+ preempt_disable();
+ ret = __find_kallsyms_symbol_value(mod, name);
+ preempt_enable();
+ return ret;
+}
+
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
+ /* Use rcu_dereference_sched() to remain compliant with the sparse tool */
+ preempt_disable();
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ preempt_enable();
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ kallsyms_symbol_value(sym));
+ if (ret != 0)
+ goto out;
+ }
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..995c32d3698f
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+ kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+ kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+ kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
+ kdb_printf(" 0x%px ", (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
diff --git a/kernel/kmod.c b/kernel/module/kmod.c
index 37c3c4b97b8e..0800d9891692 100644
--- a/kernel/kmod.c
+++ b/kernel/module/kmod.c
@@ -1,6 +1,9 @@
/*
* kmod - the kernel module loader
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -27,6 +30,7 @@
#include <linux/uaccess.h>
#include <trace/events/module.h>
+#include "internal.h"
/*
* Assuming:
@@ -36,13 +40,11 @@
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
-static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
@@ -59,7 +61,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
modprobe_path is set via /proc/sys.
*/
-char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
static void free_modprobe_argv(struct subprocess_info *info)
{
@@ -67,7 +69,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
kfree(info->argv);
}
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *orig_module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
@@ -76,12 +78,14 @@ static int call_modprobe(char *module_name, int wait)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
+ char *module_name;
+ int ret;
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
- module_name = kstrdup(module_name, GFP_KERNEL);
+ module_name = kstrdup(orig_module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
@@ -96,13 +100,16 @@ static int call_modprobe(char *module_name, int wait)
if (!info)
goto free_module_name;
- return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ kmod_dup_request_announce(orig_module_name, ret);
+ return ret;
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
+ kmod_dup_request_announce(orig_module_name, -ENOMEM);
return -ENOMEM;
}
@@ -126,7 +133,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- int ret;
+ int ret, dup_ret;
/*
* We don't allow synchronous module loading from async. Module
@@ -149,29 +156,24 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
- pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
- atomic_read(&kmod_concurrent_max),
- MAX_KMOD_CONCURRENT, module_name);
- ret = wait_event_killable_timeout(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
- MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
- if (!ret) {
- pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
- module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
- return -ETIME;
- } else if (ret == -ERESTARTSYS) {
- pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
- return ret;
- }
+ ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return ret;
}
trace_module_request(module_name, wait, _RET_IP_);
+ if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
+ ret = dup_ret;
+ goto out;
+ }
+
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_inc(&kmod_concurrent_max);
- wake_up(&kmod_wq);
+out:
+ up(&kmod_concurrent_max);
return ret;
}
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..a89f01e1d6b7
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist ELF information about a module. Copy the ELF header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (!mod->klp_info)
+ return -ENOMEM;
+
+ /* ELF header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* ELF section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ if (!mod->klp_info->sechdrs) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+
+ /* ELF section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ if (!mod->klp_info->secstrings) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+
+ /* ELF symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
new file mode 100644
index 000000000000..36681911c05a
--- /dev/null
+++ b/kernel/module/main.c
@@ -0,0 +1,3369 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define INCLUDE_VERMAGIC
+
+#include <linux/export.h>
+#include <linux/extable.h>
+#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
+#include <linux/trace_events.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <linux/set_memory.h>
+#include <asm/mmu_context.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/livepatch.h>
+#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+#include <linux/jump_label.h>
+#include <linux/pfn.h>
+#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
+#include <linux/cfi.h>
+#include <linux/debugfs.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
+ * (delete and add uses RCU list operations).
+ */
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
+
+/* Work queue for freeing init sections in success case */
+static void do_free_init(struct work_struct *w);
+static DECLARE_WORK(init_free_wq, do_free_init);
+static LLIST_HEAD(init_free_list);
+
+struct mod_tree_root mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const s32 *crcs;
+ enum mod_license license;
+};
+
+/*
+ * Bounds of module memory, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+ unsigned int size, struct mod_tree_root *tree)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (mod_mem_type_is_core_data(type)) {
+ if (min < tree->data_addr_min)
+ tree->data_addr_min = min;
+ if (max > tree->data_addr_max)
+ tree->data_addr_max = max;
+ return;
+ }
+#endif
+ if (min < tree->addr_min)
+ tree->addr_min = min;
+ if (max > tree->addr_max)
+ tree->addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size)
+ __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+ }
+}
+
+/* Block module loading/unloading? */
+int modules_disabled;
+core_param(nomodule, modules_disabled, bint, 0);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+int register_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
+static inline int strong_try_module_get(struct module *mod)
+{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return -EBUSY;
+ if (try_module_get(mod))
+ return 0;
+ else
+ return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
+{
+ add_taint(flag, lockdep_ok);
+ set_bit(flag, &mod->taints);
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit.
+ */
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ /* Alloc bit cleared means "ignore it." */
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(const struct load_info *info, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
+static unsigned int find_any_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * Find a module section, or NULL. Fill in number of "objects" in section.
+ * Ignores SHF_ALLOC flag.
+ */
+static __maybe_unused void *any_section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_any_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return offset_to_ptr(&sym->name_offset);
+#else
+ return sym->name;
+#endif
+}
+
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
+}
+
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ struct find_symbol_arg *fsa)
+{
+ struct kernel_symbol *sym;
+
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
+
+ sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ sizeof(struct kernel_symbol), cmp_name);
+ if (!sym)
+ return false;
+
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, sym - syms->start);
+ fsa->sym = sym;
+ fsa->license = syms->license;
+
+ return true;
+}
+
+/*
+ * Find an exported symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it. Needs preempt disabled or module_mutex.
+ */
+bool find_symbol(struct find_symbol_arg *fsa)
+{
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY },
+ };
+ struct module *mod;
+ unsigned int i;
+
+ module_assert_mutex_or_preempt();
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY },
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+ return true;
+ }
+
+ pr_debug("Failed to find symbol %s\n", fsa->name);
+ return false;
+}
+
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
+struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
+{
+ struct module *mod;
+
+ module_assert_mutex_or_preempt();
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+ return mod;
+ }
+ return NULL;
+}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, strlen(name), false);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return mod->percpu;
+}
+
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ unsigned long align = pcpusec->sh_addralign;
+
+ if (!pcpusec->sh_size)
+ return 0;
+
+ if (align > PAGE_SIZE) {
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+ if (!mod->percpu) {
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
+ return -ENOMEM;
+ }
+ mod->percpu_size = pcpusec->sh_size;
+ return 0;
+}
+
+static void percpu_modfree(struct module *mod)
+{
+ free_percpu(mod->percpu);
+}
+
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return find_sec(info, ".data..percpu");
+}
+
+static void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ struct module *mod;
+ unsigned int cpu;
+
+ preempt_disable();
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (!mod->percpu_size)
+ continue;
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(mod->percpu, cpu);
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
+ preempt_enable();
+ return true;
+ }
+ }
+ }
+
+ preempt_enable();
+ return false;
+}
+
+/**
+ * is_module_percpu_address() - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * Return: %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
+#else /* ... !CONFIG_SMP */
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return NULL;
+}
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ /* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ return -ENOMEM;
+ return 0;
+}
+static inline void percpu_modfree(struct module *mod)
+{
+}
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return 0;
+}
+static inline void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+bool is_module_percpu_address(unsigned long addr)
+{
+ return false;
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
+ struct module_kobject *mk, char *buffer) \
+{ \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444 }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct {
+ char name[MODULE_NAME_LEN + 1];
+ char taints[MODULE_FLAGS_BUF_SIZE];
+} last_unloaded_module;
+
+#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
+/* Init the unload section of the module. */
+static int module_unload_init(struct module *mod)
+{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
+
+ /* Hold reference count during initialization. */
+ atomic_inc(&mod->refcnt);
+
+ return 0;
+}
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a)
+ return 1;
+ }
+ pr_debug("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ pr_debug("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use)
+ return -ENOMEM;
+
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+static int ref_module(struct module *a, struct module *b)
+{
+ int err;
+
+ if (b == NULL || already_uses(a, b))
+ return 0;
+
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
+ if (err)
+ return err;
+
+ err = add_module_usage(a, b);
+ if (err) {
+ module_put(b);
+ return err;
+ }
+ return 0;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module_use *use, *tmp;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ pr_debug("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
+ }
+ mutex_unlock(&module_mutex);
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+ return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
+{
+ int ret;
+
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ /* If it's not unused, quit unless we're forcing. */
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ mod->state = MODULE_STATE_GOING;
+
+ return 0;
+}
+
+/**
+ * module_refcount() - return the refcount or -1 if unloading
+ * @mod: the module we're checking
+ *
+ * Return:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
+{
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ unsigned int, flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ int ret, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+ return -EFAULT;
+ name[MODULE_NAME_LEN-1] = '\0';
+
+ audit_log_kern_module(name);
+
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->source_list)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count damn the torpedoes */
+ pr_debug("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if (mod->init && !mod->exit) {
+ forced = try_force_unload(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ mutex_unlock(&module_mutex);
+ /* Final destruction now no one is using it. */
+ if (mod->exit != NULL)
+ mod->exit();
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+
+ async_synchronize_full();
+
+ /* Store the name and taints of the last unloaded module for diagnostic purposes */
+ strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name));
+ strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints));
+
+ free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
+ return 0;
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ };
+
+ preempt_disable();
+ BUG_ON(!find_symbol(&fsa));
+ module_put(fsa.owner);
+ preempt_enable();
+}
+EXPORT_SYMBOL(__symbol_put);
+
+/* Note this assumes addr is a function, which it currently always is. */
+void symbol_put_addr(void *addr)
+{
+ struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+
+ if (core_kernel_text(a))
+ return;
+
+ /*
+ * Even though we hold a reference on the module; we still need to
+ * disable preemption in order to safely traverse the data structure.
+ */
+ preempt_disable();
+ modaddr = __module_text_address(a);
+ BUG_ON(!modaddr);
+ module_put(modaddr);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+}
+
+static struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
+
+void __module_get(struct module *module)
+{
+ if (module) {
+ atomic_inc(&module->refcnt);
+ trace_module_get(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
+ trace_module_get(module, _RET_IP_);
+ else
+ ret = false;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
+void module_put(struct module *module)
+{
+ int ret;
+
+ if (module) {
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
+ trace_module_put(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static int ref_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b);
+}
+
+static inline int module_unload_init(struct module *mod)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+size_t module_flags_taint(unsigned long taints, char *buf)
+{
+ size_t l = 0;
+ int i;
+
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ if (taint_flags[i].module && test_bit(i, &taints))
+ buf[l++] = taint_flags[i].c_true;
+ }
+
+ return l;
+}
+
+static ssize_t show_initstate(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mk->mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ default:
+ BUG();
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
+
+static ssize_t store_uevent(struct module_attribute *mattr,
+ struct module_kobject *mk,
+ const char *buffer, size_t count)
+{
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
+}
+
+struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ }
+ return sprintf(buffer, "%u\n", size);
+}
+
+static struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static struct module_attribute modinfo_datasize =
+ __ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, init)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod->taints, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
+
+struct module_attribute *modinfo_attrs[] = {
+ &module_uevent,
+ &modinfo_version,
+ &modinfo_srcversion,
+ &modinfo_initstate,
+ &modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ &modinfo_datasize,
+#endif
+ &modinfo_initsize,
+ &modinfo_taint,
+#ifdef CONFIG_MODULE_UNLOAD
+ &modinfo_refcnt,
+#endif
+ NULL,
+};
+
+size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+int try_to_force_load(struct module *mod, const char *reason)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!test_taint(TAINT_FORCED_MODULE))
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
+/* Parse tag=value strings from .modinfo section */
+char *module_next_tag_pair(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
+
+ /*
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
+ */
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = module_next_tag_pair(prev, &size);
+ }
+
+ for (p = modinfo; p; p = module_next_tag_pair(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace && namespace[0]) {
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+ mod->name, name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+ mod->name, name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
+ const char *name,
+ char ownername[])
+{
+ struct find_symbol_arg fsa = {
+ .name = name,
+ .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+ .warn = true,
+ };
+ int err;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ if (!find_symbol(&fsa))
+ goto unlock;
+
+ if (fsa.license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, fsa.owner, name)) {
+ fsa.sym = NULL;
+ goto getname;
+ }
+
+ if (!check_version(info, name, mod, fsa.crc)) {
+ fsa.sym = ERR_PTR(-EINVAL);
+ goto getname;
+ }
+
+ err = verify_namespace_is_imported(info, fsa.sym, mod);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+ err = ref_module(mod, fsa.owner);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make copy under the lock if we failed to get ref. */
+ strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
+ return fsa.sym;
+}
+
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
+{
+ const struct kernel_symbol *ksym;
+ char owner[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
+ }
+ return ksym;
+}
+
+void __weak module_memfree(void *module_region)
+{
+ /*
+ * This memory may be RO, and freeing RO memory in an interrupt is not
+ * supported by vmalloc.
+ */
+ WARN_ON(in_interrupt());
+ vfree(module_region);
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
+static bool mod_mem_use_vmalloc(enum mod_mem_type type)
+{
+ return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) &&
+ mod_mem_type_is_core_data(type);
+}
+
+static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
+{
+ if (mod_mem_use_vmalloc(type))
+ return vzalloc(size);
+ return module_alloc(size);
+}
+
+static void module_memory_free(void *ptr, enum mod_mem_type type)
+{
+ if (mod_mem_use_vmalloc(type))
+ vfree(ptr);
+ else
+ module_memfree(ptr);
+}
+
+static void free_mod_mem(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (type == MOD_DATA)
+ continue;
+
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ if (mod_mem->size)
+ module_memory_free(mod_mem->base, type);
+ }
+
+ /* MOD_DATA hosts mod, so free it at last */
+ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+}
+
+/* Free a module, remove from lists, etc. */
+static void free_module(struct module *mod)
+{
+ trace_module_free(mod);
+
+ mod_sysfs_teardown(mod);
+
+ /*
+ * We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed.
+ */
+ mutex_lock(&module_mutex);
+ mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* Free any allocated parameters. */
+ destroy_params(mod->kp, mod->num_kp);
+
+ if (is_livepatch_module(mod))
+ free_module_elf(mod);
+
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
+ if (try_add_tainted_module(mod))
+ pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+ mod->name);
+ mutex_unlock(&module_mutex);
+
+ /* This may be empty, but that's OK */
+ module_arch_freeing_init(mod);
+ kfree(mod->args);
+ percpu_modfree(mod);
+
+ free_mod_mem(mod);
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ .warn = true,
+ };
+
+ preempt_disable();
+ if (!find_symbol(&fsa))
+ goto fail;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ goto fail;
+ }
+ if (strong_try_module_get(fsa.owner))
+ goto fail;
+ preempt_enable();
+ return (void *)kernel_symbol_value(fsa.sym);
+fail:
+ preempt_enable();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
+ */
+static int verify_exported_symbols(struct module *mod)
+{
+ unsigned int i;
+ const struct kernel_symbol *s;
+ struct {
+ const struct kernel_symbol *sym;
+ unsigned int num;
+ } arr[] = {
+ { mod->syms, mod->num_syms },
+ { mod->gpl_syms, mod->num_gpl_syms },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
+ pr_err("%s: exports duplicate symbol %s"
+ " (owned by %s)\n",
+ mod->name, kernel_symbol_name(s),
+ module_name(fsa.owner));
+ return -ENOEXEC;
+ }
+ }
+ }
+ return 0;
+}
+
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
+ unsigned long secbase;
+ unsigned int i;
+ int ret = 0;
+ const struct kernel_symbol *ksym;
+
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
+ /*
+ * We compiled with -fno-common. These are not
+ * supposed to happen.
+ */
+ pr_debug("Common symbol: %s\n", name);
+ pr_warn("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ pr_debug("Absolute symbol: 0x%08lx %s\n",
+ (long)sym[i].st_value, name);
+ break;
+
+ case SHN_LIVEPATCH:
+ /* Livepatch symbols are resolved by livepatch */
+ break;
+
+ case SHN_UNDEF:
+ ksym = resolve_symbol_wait(mod, info, name);
+ /* Ok if resolved. */
+ if (ksym && !IS_ERR(ksym)) {
+ sym[i].st_value = kernel_symbol_value(ksym);
+ break;
+ }
+
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
+ break;
+
+ ret = PTR_ERR(ksym) ?: -ENOENT;
+ pr_warn("%s: Unknown symbol %s (err %d)\n",
+ mod->name, name, ret);
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == info->index.pcpu)
+ secbase = (unsigned long)mod_percpu(mod);
+ else
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+ err = klp_apply_section_relocs(mod, info->sechdrs,
+ info->secstrings,
+ info->strtab,
+ info->index.sym, i,
+ NULL);
+ else if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+ unsigned int section)
+{
+ /* default implementation just returns zero */
+ return 0;
+}
+
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section)
+{
+ long offset;
+ long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
+
+ mod->mem[type].size += arch_mod_section_prepend(mod, section);
+ offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+ mod->mem[type].size = offset + sechdr->sh_size;
+
+ WARN_ON_ONCE(offset & mask);
+ return offset | mask;
+}
+
+bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+ if (module_exit_section(sname))
+ return true;
+#endif
+ return module_init_section(sname);
+}
+
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
+{
+ unsigned int m, i;
+
+ static const unsigned long masks[][2] = {
+ /*
+ * NOTE: all executable code must be the first section
+ * in this array; otherwise modify the text_size
+ * finder in the two loops below
+ */
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ static const int core_m_to_mem_type[] = {
+ MOD_TEXT,
+ MOD_RODATA,
+ MOD_RO_AFTER_INIT,
+ MOD_DATA,
+ MOD_DATA,
+ };
+ static const int init_m_to_mem_type[] = {
+ MOD_INIT_TEXT,
+ MOD_INIT_RODATA,
+ MOD_INVALID,
+ MOD_INIT_DATA,
+ MOD_INIT_DATA,
+ };
+
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || is_init != module_init_layout_section(sname))
+ continue;
+
+ if (WARN_ON_ONCE(type == MOD_INVALID))
+ continue;
+
+ s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
+ pr_debug("\t%s\n", sname);
+ }
+ }
+}
+
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, false);
+
+ pr_debug("Init section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, true);
+}
+
+static void module_license_taint_check(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ if (!license_is_gpl_compatible(license)) {
+ if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+}
+
+static void setup_modinfo(struct module *mod, struct load_info *info)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
+ }
+}
+
+static void free_modinfo(struct module *mod)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->free)
+ attr->free(mod);
+ }
+}
+
+void * __weak module_alloc(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+bool __weak module_init_section(const char *name)
+{
+ return strstarts(name, ".init");
+}
+
+bool __weak module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit");
+}
+
+static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
+{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
+ unsigned long secend;
+#endif
+
+ /*
+ * Check for both overflow and offset/size being
+ * too large.
+ */
+ secend = shdr->sh_offset + shdr->sh_size;
+ if (secend < shdr->sh_offset || secend > info->len)
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/*
+ * Check userspace passed ELF module against our expectations, and cache
+ * useful variables for further processing as we go.
+ *
+ * This does basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ *
+ * As a last step, since we're already checking the ELF sections we cache
+ * useful variables which will be used later for our convenience:
+ *
+ * o pointers to section headers
+ * o cache the modinfo symbol section
+ * o cache the string symbol section
+ * o cache the module section
+ *
+ * As a last step we set info->mod to the temporary copy of the module in
+ * info->hdr. The final one will be allocated in move_module(). Any
+ * modifications we make to our copy of the module will be carried over
+ * to the final minted module.
+ */
+static int elf_validity_cache_copy(struct load_info *info, int flags)
+{
+ unsigned int i;
+ Elf_Shdr *shdr, *strhdr;
+ int err;
+ unsigned int num_mod_secs = 0, mod_idx;
+ unsigned int num_info_secs = 0, info_idx;
+ unsigned int num_sym_secs = 0, sym_idx;
+
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ goto no_exec;
+ }
+
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ goto no_exec;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ goto no_exec;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ goto no_exec;
+ }
+
+ /*
+ * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+ * known and small. So e_shnum * sizeof(Elf_Shdr)
+ * will not overflow unsigned long on any platform.
+ */
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ goto no_exec;
+ }
+
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+ /*
+ * Verify if the section name table index is valid.
+ */
+ if (info->hdr->e_shstrndx == SHN_UNDEF
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
+
+ strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+ err = validate_section_offset(info, strhdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
+ return err;
+ }
+
+ /*
+ * The section name table must be NUL-terminated, as required
+ * by the spec. This makes strcmp and pr_* calls that access
+ * strings in the section safe.
+ */
+ info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (strhdr->sh_size == 0) {
+ pr_err("empty section name table\n");
+ goto no_exec;
+ }
+ if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ goto no_exec;
+ }
+
+ /*
+ * The code assumes that section 0 has a length of zero and
+ * an addr of zero, so check for it.
+ */
+ if (info->sechdrs[0].sh_type != SHT_NULL
+ || info->sechdrs[0].sh_size != 0
+ || info->sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ info->sechdrs[0].sh_type);
+ goto no_exec;
+ }
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ shdr = &info->sechdrs[i];
+ switch (shdr->sh_type) {
+ case SHT_NULL:
+ case SHT_NOBITS:
+ continue;
+ case SHT_SYMTAB:
+ if (shdr->sh_link == SHN_UNDEF
+ || shdr->sh_link >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ shdr->sh_link, shdr->sh_link,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
+ num_sym_secs++;
+ sym_idx = i;
+ fallthrough;
+ default:
+ err = validate_section_offset(info, shdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return err;
+ }
+ if (strcmp(info->secstrings + shdr->sh_name,
+ ".gnu.linkonce.this_module") == 0) {
+ num_mod_secs++;
+ mod_idx = i;
+ } else if (strcmp(info->secstrings + shdr->sh_name,
+ ".modinfo") == 0) {
+ num_info_secs++;
+ info_idx = i;
+ }
+
+ if (shdr->sh_flags & SHF_ALLOC) {
+ if (shdr->sh_name >= strhdr->sh_size) {
+ pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return -ENOEXEC;
+ }
+ }
+ break;
+ }
+ }
+
+ if (num_info_secs > 1) {
+ pr_err("Only one .modinfo section must exist.\n");
+ goto no_exec;
+ } else if (num_info_secs == 1) {
+ /* Try to find a name early so we can log errors with a module name */
+ info->index.info = info_idx;
+ info->name = get_modinfo(info, "name");
+ }
+
+ if (num_sym_secs != 1) {
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ /* Sets internal symbols and strings. */
+ info->index.sym = sym_idx;
+ shdr = &info->sechdrs[sym_idx];
+ info->index.str = shdr->sh_link;
+ info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset;
+
+ /*
+ * The ".gnu.linkonce.this_module" ELF section is special. It is
+ * what modpost uses to refer to __this_module and let's use rely
+ * on THIS_MODULE to point to &__this_module properly. The kernel's
+ * modpost declares it on each modules's *.mod.c file. If the struct
+ * module of the kernel changes a full kernel rebuild is required.
+ *
+ * We have a few expectaions for this special section, the following
+ * code validates all this for us:
+ *
+ * o Only one section must exist
+ * o We expect the kernel to always have to allocate it: SHF_ALLOC
+ * o The section size must match the kernel's run time's struct module
+ * size
+ */
+ if (num_mod_secs != 1) {
+ pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ shdr = &info->sechdrs[mod_idx];
+
+ /*
+ * This is already implied on the switch above, however let's be
+ * pedantic about it.
+ */
+ if (shdr->sh_type == SHT_NOBITS) {
+ pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (!(shdr->sh_flags & SHF_ALLOC)) {
+ pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (shdr->sh_size != sizeof(struct module)) {
+ pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ info->index.mod = mod_idx;
+
+ /* This is temporary: point mod into copy of data. */
+ info->mod = (void *)info->hdr + shdr->sh_offset;
+
+ /*
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = info->mod->name;
+
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
+
+ info->index.pcpu = find_pcpusec(info);
+
+ return 0;
+
+no_exec:
+ return -ENOEXEC;
+}
+
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+ if (!get_modinfo(info, "livepatch"))
+ /* Nothing more to do */
+ return 0;
+
+ if (set_livepatch_module(mod))
+ return 0;
+
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
+}
+
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
+{
+ int err;
+
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ err = security_kernel_load_data(LOADING_MODULE, true);
+ if (err)
+ return err;
+
+ /* Suck in entire file: we'll want most of it. */
+ info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
+ if (!info->hdr)
+ return -ENOMEM;
+
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = security_kernel_post_load_data((char *)info->hdr, info->len,
+ LOADING_MODULE, "init_module");
+out:
+ if (err)
+ vfree(info->hdr);
+
+ return err;
+}
+
+static void free_copy(struct load_info *info, int flags)
+{
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ module_decompress_cleanup(info);
+ else
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info, int flags)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+
+ /*
+ * Mark all sections sh_addr with their address in the
+ * temporary image.
+ */
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
+
+ }
+
+ /* Track but don't keep modinfo and version sections. */
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ return 0;
+}
+
+/*
+ * These calls taint the kernel depending certain module circumstances */
+static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
+{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
+
+ check_modinfo_retpoline(mod, info);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
+ }
+
+ if (is_livepatch_module(mod)) {
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
+ }
+
+ module_license_taint_check(mod, get_modinfo(info, "license"));
+
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
+ }
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+
+ /* driverloader was caught wrongly pretending to be under GPL */
+ if (strcmp(mod->name, "driverloader") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
+
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ err = try_to_force_load(mod, "bad vermagic");
+ if (err)
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+ pr_err("%s: version magic '%s' should be '%s'\n",
+ info->name, modmagic, vermagic);
+ return -ENOEXEC;
+ }
+
+ err = check_modinfo_livepatch(mod, info);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ if (!mod->ctors)
+ mod->ctors = section_objs(info, ".init_array",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ else if (find_sec(info, ".init_array")) {
+ /*
+ * This shouldn't happen with same compiler and binutils
+ * building all parts of the module.
+ */
+ pr_warn("%s: has both .ctors and .init_array.\n",
+ mod->name);
+ return -EINVAL;
+ }
+#endif
+
+ mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
+ &mod->noinstr_text_size);
+
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_TREE_SRCU
+ mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ sizeof(*mod->srcu_struct_ptrs),
+ &mod->num_srcu_structs);
+#endif
+#ifdef CONFIG_BPF_EVENTS
+ mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ sizeof(*mod->bpf_raw_events),
+ &mod->num_bpf_raw_events);
+#endif
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
+#endif
+#ifdef CONFIG_JUMP_LABEL
+ mod->jump_entries = section_objs(info, "__jump_table",
+ sizeof(*mod->jump_entries),
+ &mod->num_jump_entries);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
+#endif
+#ifdef CONFIG_TRACING
+ mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+ sizeof(*mod->trace_bprintk_fmt_start),
+ &mod->num_trace_bprintk_fmt);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ sizeof(*mod->ei_funcs),
+ &mod->num_ei_funcs);
+#endif
+#ifdef CONFIG_KPROBES
+ mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
+ &mod->kprobes_text_size);
+ mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
+ sizeof(unsigned long),
+ &mod->num_kprobe_blacklist);
+#endif
+#ifdef CONFIG_PRINTK_INDEX
+ mod->printk_index_start = section_objs(info, ".printk_index",
+ sizeof(*mod->printk_index_start),
+ &mod->printk_index_size);
+#endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ mod->static_call_sites = section_objs(info, ".static_call_sites",
+ sizeof(*mod->static_call_sites),
+ &mod->num_static_call_sites);
+#endif
+#if IS_ENABLED(CONFIG_KUNIT)
+ mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+ sizeof(*mod->kunit_suites),
+ &mod->num_kunit_suites);
+ mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+ sizeof(*mod->kunit_init_suites),
+ &mod->num_kunit_init_suites);
+#endif
+
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+
+ if (section_addr(info, "__obsparm"))
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+ mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+ sizeof(*mod->dyndbg_info.descs),
+ &mod->dyndbg_info.num_descs);
+ mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*mod->dyndbg_info.classes),
+ &mod->dyndbg_info.num_classes);
+#endif
+
+ return 0;
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i;
+ void *ptr;
+ enum mod_mem_type t = 0;
+ int ret = -ENOMEM;
+
+ for_each_mod_mem_type(type) {
+ if (!mod->mem[type].size) {
+ mod->mem[type].base = NULL;
+ continue;
+ }
+ mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
+ ptr = module_memory_alloc(mod->mem[type].size, type);
+ /*
+ * The pointer to these blocks of memory are stored on the module
+ * structure and we keep that around so long as the module is
+ * around. We only free that memory when we unload the module.
+ * Just mark them as not being a leak then. The .init* ELF
+ * sections *do* get freed after boot so we *could* treat them
+ * slightly differently with kmemleak_ignore() and only grey
+ * them out as they work as typical memory allocations which
+ * *do* eventually get freed, but let's just keep things simple
+ * and avoid *any* false positives.
+ */
+ kmemleak_not_leak(ptr);
+ if (!ptr) {
+ t = type;
+ goto out_enomem;
+ }
+ memset(ptr, 0, mod->mem[type].size);
+ mod->mem[type].base = ptr;
+ }
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ pr_debug("Final section addresses for %s:\n", mod->name);
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+
+ if (!(shdr->sh_flags & SHF_ALLOC))
+ continue;
+
+ dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
+
+ if (shdr->sh_type != SHT_NOBITS) {
+ /*
+ * Our ELF checker already validated this, but let's
+ * be pedantic and make the goal clearer. We actually
+ * end up copying over all modifications made to the
+ * userspace copy of the entire struct module.
+ */
+ if (i == info->index.mod &&
+ (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
+ ret = -ENOEXEC;
+ goto out_enomem;
+ }
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+ }
+ /*
+ * Update the userspace copy's ELF section address to point to
+ * our newly allocated memory as a pure convenience so that
+ * users of info can keep taking advantage and using the newly
+ * minted official memory area.
+ */
+ shdr->sh_addr = (unsigned long)dest;
+ pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
+ (long)shdr->sh_size, info->secstrings + shdr->sh_name);
+ }
+
+ return 0;
+out_enomem:
+ for (t--; t >= 0; t--)
+ module_memory_free(mod->mem[t].base, t);
+ return ret;
+}
+
+static int check_export_symbol_versions(struct module *mod)
+{
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !mod->crcs) ||
+ (mod->num_gpl_syms && !mod->gpl_crcs)) {
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
+ }
+#endif
+ return 0;
+}
+
+static void flush_module_icache(const struct module *mod)
+{
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size) {
+ flush_icache_range((unsigned long)mod_mem->base,
+ (unsigned long)mod_mem->base + mod_mem->size);
+ }
+ }
+}
+
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(const char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
+static struct module *layout_and_allocate(struct load_info *info, int flags)
+{
+ struct module *mod;
+ unsigned int ndx;
+ int err;
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ /*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ ndx = find_sec(info, ".data..ro_after_init");
+ if (ndx)
+ info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+ /*
+ * Mark the __jump_table section as ro_after_init as well: these data
+ * structures are never modified, with the exception of entries that
+ * refer to code in the __init section, which are annotated as such
+ * at module load time.
+ */
+ ndx = find_sec(info, "__jump_table");
+ if (ndx)
+ info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
+ /*
+ * Determine total sizes, and put offsets in sh_entsize. For now
+ * this is done generically; there doesn't appear to be any
+ * special cases for the architectures.
+ */
+ layout_sections(info->mod, info);
+ layout_symtab(info->mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(info->mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ return mod;
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ percpu_modfree(mod);
+ module_arch_freeing_init(mod);
+
+ free_mod_mem(mod);
+}
+
+int __weak module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+ struct llist_node node;
+ void *init_text;
+ void *init_data;
+ void *init_rodata;
+};
+
+static void do_free_init(struct work_struct *w)
+{
+ struct llist_node *pos, *n, *list;
+ struct mod_initfree *initfree;
+
+ list = llist_del_all(&init_free_list);
+
+ synchronize_rcu();
+
+ llist_for_each_safe(pos, n, list) {
+ initfree = container_of(pos, struct mod_initfree, node);
+ module_memfree(initfree->init_text);
+ module_memfree(initfree->init_data);
+ module_memfree(initfree->init_rodata);
+ kfree(initfree);
+ }
+}
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+/* Default value for module->async_probe_requested */
+static bool async_probe;
+module_param(async_probe, bool, 0644);
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
+{
+ int ret = 0;
+ struct mod_initfree *freeinit;
+#if defined(CONFIG_MODULE_STATS)
+ unsigned int text_size = 0, total_size = 0;
+
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+ if (mod_mem->size) {
+ total_size += mod_mem->size;
+ if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+ text_size += mod_mem->size;
+ }
+ }
+#endif
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+ freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+ freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ goto fail_free_freeinit;
+ }
+ if (ret > 0) {
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /* Delay uevent until module has finished its init routine */
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
+ *
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
+ */
+ if (!mod->async_probe_requested)
+ async_synchronize_full();
+
+ ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+ mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
+#endif
+ module_enable_ro(mod, true);
+ mod_tree_remove_init(mod);
+ module_arch_freeing_init(mod);
+ for_class_mod_mem_type(type, init) {
+ mod->mem[type].base = NULL;
+ mod->mem[type].size = 0;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
+ mod->btf_data = NULL;
+#endif
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this with preempt disabled. In all the failure paths, we
+ * call synchronize_rcu(), but we don't want to slow down the success
+ * path. module_memfree() cannot be called in an interrupt, so do the
+ * work and call synchronize_rcu() in a work queue.
+ *
+ * Note that module_alloc() on most architectures creates W+X page
+ * mappings which won't be cleaned up until do_free_init() runs. Any
+ * code such as mark_rodata_ro() which depends on those mappings to
+ * be cleaned up needs to sync with the queued work - ie
+ * rcu_barrier()
+ */
+ if (llist_add(&freeinit->node, &init_free_list))
+ schedule_work(&init_free_wq);
+
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ mod_stat_add_long(text_size, &total_text_size);
+ mod_stat_add_long(total_size, &total_mod_size);
+
+ mod_stat_inc(&modcount);
+
+ return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_rcu();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+
+ return ret;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ struct module *old;
+ int err = 0;
+
+ old = find_module_all(name, strlen(name), true);
+ if (old == NULL)
+ return 0;
+
+ if (old->state == MODULE_STATE_COMING ||
+ old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(name));
+ mutex_lock(&module_mutex);
+ if (err)
+ return err;
+
+ /* The module might have gone in the meantime. */
+ old = find_module_all(name, strlen(name), true);
+ }
+
+ if (try_add_failed_module(name, reason))
+ pr_warn("Could not add fail-tracking for module: %s\n", name);
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ return -EEXIST;
+ return -EBUSY;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
+ if (err)
+ goto out;
+
+ mod_update_bounds(mod);
+ list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_exported_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* These rely on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
+
+ module_enable_ro(mod, false);
+ module_enable_nx(mod);
+ module_enable_x(mod);
+
+ /*
+ * Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us.
+ */
+ mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ return 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int prepare_coming_module(struct module *mod)
+{
+ int err;
+
+ ftrace_module_enable(mod);
+ err = klp_module_coming(mod);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain_robust(&module_notify_list,
+ MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+ err = notifier_to_errno(err);
+ if (err)
+ klp_module_going(mod);
+
+ return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ void *arg)
+{
+ struct module *mod = arg;
+ int ret;
+
+ if (strcmp(param, "async_probe") == 0) {
+ if (kstrtobool(val, &mod->async_probe_requested))
+ mod->async_probe_requested = true;
+ return 0;
+ }
+
+ /* Check for magic 'dyndbg' arg */
+ ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ return 0;
+}
+
+/* Module within temporary copy, this doesn't do any allocation */
+static int early_mod_check(struct load_info *info, int flags)
+{
+ int err;
+
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
+ if (blacklisted(info->name)) {
+ pr_err("Module %s is blacklisted\n", info->name);
+ return -EPERM;
+ }
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return err;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod))
+ return -ENOEXEC;
+
+ err = check_modinfo(info->mod, info, flags);
+ if (err)
+ return err;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING);
+ mutex_unlock(&module_mutex);
+
+ return err;
+}
+
+/*
+ * Allocate and load the module: note that size of section 0 is always
+ * zero, and we rely on this for optional sections.
+ */
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
+{
+ struct module *mod;
+ bool module_allocated = false;
+ long err = 0;
+ char *after_dashes;
+
+ /*
+ * Do the signature check (if any) first. All that
+ * the signature check needs is info->len, it does
+ * not need any of the section info. That can be
+ * set up later. This will minimize the chances
+ * of a corrupt module causing problems before
+ * we even get to the signature check.
+ *
+ * The check will also adjust info->len by stripping
+ * off the sig length at the end of the module, making
+ * checks against info->len more correct.
+ */
+ err = module_sig_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /*
+ * Do basic sanity checks against the ELF header and
+ * sections. Cache useful sections and set the
+ * info->mod to the userspace passed struct module.
+ */
+ err = elf_validity_cache_copy(info, flags);
+ if (err)
+ goto free_copy;
+
+ err = early_mod_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(info, flags);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ module_allocated = true;
+
+ audit_log_kern_module(mod->name);
+
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
+ /*
+ * We are tainting your kernel if your module gets into
+ * the modules linked list somehow.
+ */
+ module_augment_kernel_taints(mod, info);
+
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = percpu_modalloc(mod, info);
+ if (err)
+ goto unlink_mod;
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto unlink_mod;
+
+ init_param_lock(mod);
+
+ /*
+ * Now we've got everything in the final locations, we can
+ * find optional sections.
+ */
+ err = find_module_sections(mod, info);
+ if (err)
+ goto free_unload;
+
+ err = check_export_symbol_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, info);
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ init_build_id(mod, info);
+
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
+
+ err = prepare_coming_module(mod);
+ if (err)
+ goto bug_cleanup;
+
+ mod->async_probe_requested = async_probe;
+
+ /* Module is ready to execute: parsing args may do that. */
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, mod,
+ unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
+ goto coming_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
+
+ /* Link in to sysfs. */
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+ if (err < 0)
+ goto coming_cleanup;
+
+ if (is_livepatch_module(mod)) {
+ err = copy_module_elf(mod, info);
+ if (err < 0)
+ goto sysfs_cleanup;
+ }
+
+ /* Get rid of temporary copy. */
+ free_copy(info, flags);
+
+ /* Done! */
+ trace_module_load(mod);
+
+ return do_init_module(mod);
+
+ sysfs_cleanup:
+ mod_sysfs_teardown(mod);
+ coming_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ destroy_params(mod->kp, mod->num_kp);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ bug_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ /* module_bug_cleanup needs module_mutex protection */
+ mutex_lock(&module_mutex);
+ module_bug_cleanup(mod);
+ mutex_unlock(&module_mutex);
+
+ ddebug_cleanup:
+ ftrace_release_mod(mod);
+ synchronize_rcu();
+ kfree(mod->args);
+ free_arch_cleanup:
+ module_arch_cleanup(mod);
+ free_modinfo:
+ free_modinfo(mod);
+ free_unload:
+ module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ wake_up_all(&module_wq);
+ /* Wait for RCU-sched synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ mutex_unlock(&module_mutex);
+ free_module:
+ mod_stat_bump_invalid(info, flags);
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ for_class_mod_mem_type(type, core_data) {
+ lockdep_free_key_range(mod->mem[type].base,
+ mod->mem[type].size);
+ }
+
+ module_deallocate(mod, info);
+ free_copy:
+ /*
+ * The info->len is always set. We distinguish between
+ * failures once the proper module was allocated and
+ * before that.
+ */
+ if (!module_allocated)
+ mod_stat_bump_becoming(info, flags);
+ free_copy(info, flags);
+ return err;
+}
+
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+ unsigned long, len, const char __user *, uargs)
+{
+ int err;
+ struct load_info info = { };
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ err = copy_module_from_user(umod, len, &info);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
+ return err;
+ }
+
+ return load_module(&info, uargs, 0);
+}
+
+struct idempotent {
+ const void *cookie;
+ struct hlist_node entry;
+ struct completion complete;
+ int ret;
+};
+
+#define IDEM_HASH_BITS 8
+static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
+static DEFINE_SPINLOCK(idem_lock);
+
+static bool idempotent(struct idempotent *u, const void *cookie)
+{
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct idempotent *existing;
+ bool first;
+
+ u->ret = 0;
+ u->cookie = cookie;
+ init_completion(&u->complete);
+
+ spin_lock(&idem_lock);
+ first = true;
+ hlist_for_each_entry(existing, head, entry) {
+ if (existing->cookie != cookie)
+ continue;
+ first = false;
+ break;
+ }
+ hlist_add_head(&u->entry, idem_hash + hash);
+ spin_unlock(&idem_lock);
+
+ return !first;
+}
+
+/*
+ * We were the first one with 'cookie' on the list, and we ended
+ * up completing the operation. We now need to walk the list,
+ * remove everybody - which includes ourselves - fill in the return
+ * value, and then complete the operation.
+ */
+static int idempotent_complete(struct idempotent *u, int ret)
+{
+ const void *cookie = u->cookie;
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct hlist_node *next;
+ struct idempotent *pos;
+
+ spin_lock(&idem_lock);
+ hlist_for_each_entry_safe(pos, next, head, entry) {
+ if (pos->cookie != cookie)
+ continue;
+ hlist_del(&pos->entry);
+ pos->ret = ret;
+ complete(&pos->complete);
+ }
+ spin_unlock(&idem_lock);
+ return ret;
+}
+
+static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
+{
+ struct load_info info = { };
+ void *buf = NULL;
+ int len;
+
+ len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
+ if (len < 0) {
+ mod_stat_inc(&failed_kreads);
+ return len;
+ }
+
+ if (flags & MODULE_INIT_COMPRESSED_FILE) {
+ int err = module_decompress(&info, buf, len);
+ vfree(buf); /* compressed data is no longer needed */
+ if (err) {
+ mod_stat_inc(&failed_decompress);
+ mod_stat_add_long(len, &invalid_decompress_bytes);
+ return err;
+ }
+ } else {
+ info.hdr = buf;
+ info.len = len;
+ }
+
+ return load_module(&info, uargs, flags);
+}
+
+static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
+{
+ struct idempotent idem;
+
+ if (!f || !(f->f_mode & FMODE_READ))
+ return -EBADF;
+
+ /* See if somebody else is doing the operation? */
+ if (idempotent(&idem, file_inode(f))) {
+ wait_for_completion(&idem.complete);
+ return idem.ret;
+ }
+
+ /* Otherwise, we'll do it and complete others */
+ return idempotent_complete(&idem,
+ init_module_from_file(f, uargs, flags));
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct fd f;
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
+ return -EINVAL;
+
+ f = fdget(fd);
+ err = idempotent_init_module(f.file, uargs, flags);
+ fdput(f);
+ return err;
+}
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
+char *module_flags(struct module *mod, char *buf, bool show_state)
+{
+ int bx = 0;
+
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ if (!mod->taints && !show_state)
+ goto out;
+ if (mod->taints ||
+ mod->state == MODULE_STATE_GOING ||
+ mod->state == MODULE_STATE_COMING) {
+ buf[bx++] = '(';
+ bx += module_flags_taint(mod->taints, buf + bx);
+ /* Show a - for module-is-being-unloaded */
+ if (mod->state == MODULE_STATE_GOING && show_state)
+ buf[bx++] = '-';
+ /* Show a + for module-is-being-loaded */
+ if (mod->state == MODULE_STATE_COMING && show_state)
+ buf[bx++] = '+';
+ buf[bx++] = ')';
+ }
+out:
+ buf[bx] = '\0';
+
+ return buf;
+}
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ const struct exception_table_entry *e = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addr);
+ if (!mod)
+ goto out;
+
+ if (!mod->num_exentries)
+ goto out;
+
+ e = search_extable(mod->extable,
+ mod->num_exentries,
+ addr);
+out:
+ preempt_enable();
+
+ /*
+ * Now, if we found one, we are running inside it now, hence
+ * we cannot unload the module, hence no refcnt needed.
+ */
+ return e;
+}
+
+/**
+ * is_module_address() - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
+ */
+bool is_module_address(unsigned long addr)
+{
+ bool ret;
+
+ preempt_disable();
+ ret = __module_address(addr) != NULL;
+ preempt_enable();
+
+ return ret;
+}
+
+/**
+ * __module_address() - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
+{
+ struct module *mod;
+
+ if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+ goto lookup;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+ goto lookup;
+#endif
+
+ return NULL;
+
+lookup:
+ module_assert_mutex_or_preempt();
+
+ mod = mod_find(addr, &mod_tree);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
+ if (mod->state == MODULE_STATE_UNFORMED)
+ mod = NULL;
+ }
+ return mod;
+}
+
+/**
+ * is_module_text_address() - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module. See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+ bool ret;
+
+ preempt_disable();
+ ret = __module_text_address(addr) != NULL;
+ preempt_enable();
+
+ return ret;
+}
+
+/**
+ * __module_text_address() - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod = __module_address(addr);
+ if (mod) {
+ /* Make sure it's within the text section. */
+ if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+ !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
+ mod = NULL;
+ }
+ return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ printk(KERN_DEFAULT "Modules linked in:");
+ /* Most callers should already have preempt disabled, but make sure */
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf, true));
+ }
+
+ print_unloaded_tainted_modules();
+ preempt_enable();
+ if (last_unloaded_module.name[0])
+ pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
+ last_unloaded_module.taints);
+ pr_cont("\n");
+}
+
+#ifdef CONFIG_MODULE_DEBUGFS
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+ mod_debugfs_root = debugfs_create_dir("modules", NULL);
+ return 0;
+}
+module_init(module_debugfs_init);
+#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..0a4841e88adb
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init && !mod->exit) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static unsigned int module_total_size(struct module *mod)
+{
+ int size = 0;
+
+ for_each_mod_mem_type(type)
+ size += mod->mem[type].size;
+ return size;
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ void *value;
+ unsigned int size;
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ size = module_total_size(mod);
+ seq_printf(m, "%s %u", mod->name, size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ value = m->private ? NULL : mod->mem[MOD_TEXT].base;
+ seq_printf(m, " 0x%px", value);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf, true));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ }
+
+ return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &modules_proc_ops);
+ return 0;
+}
+module_init(proc_modules_init);
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..a2ff4242e623
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <linux/security.h>
+#include <crypto/public_key.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+ struct module_signature ms;
+ size_t sig_len, modlen = info->len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+ ret = mod_check_sig(&ms, modlen, "module");
+ if (ret)
+ return ret;
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ modlen -= sig_len + sizeof(ms);
+ info->len = modlen;
+
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+}
+
+int module_sig_check(struct load_info *info, int flags)
+{
+ int err = -ENODATA;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
+ const void *mod = info->hdr;
+ bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+ MODULE_INIT_IGNORE_VERMAGIC);
+ /*
+ * Do not allow mangled modules as a module with version information
+ * removed is no longer the module that was signed.
+ */
+ if (!mangled_module &&
+ info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+ }
+
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
+ switch (err) {
+ case -ENODATA:
+ reason = "unsigned module";
+ break;
+ case -ENOPKG:
+ reason = "module with unsupported crypto";
+ break;
+ case -ENOKEY:
+ reason = "module with unavailable key";
+ break;
+
+ default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
+ return err;
+ }
+
+ if (is_module_sig_enforced()) {
+ pr_notice("Loading of %s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..3ba0e98b3c91
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Debugging module statistics.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <uapi/linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include <linux/math.h>
+
+#include "internal.h"
+
+/**
+ * DOC: module debugging statistics overview
+ *
+ * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
+ * are useful to monitor and root cause memory pressure issues with module
+ * loading. These statistics are useful to allow us to improve production
+ * workloads.
+ *
+ * The current module debugging statistics supported help keep track of module
+ * loading failures to enable improvements either for kernel module auto-loading
+ * usage (request_module()) or interactions with userspace. Statistics are
+ * provided to track all possible failures in the finit_module() path and memory
+ * wasted in this process space. Each of the failure counters are associated
+ * to a type of module loading failure which is known to incur a certain amount
+ * of memory allocation loss. In the worst case loading a module will fail after
+ * a 3 step memory allocation process:
+ *
+ * a) memory allocated with kernel_read_file_from_fd()
+ * b) module decompression processes the file read from
+ * kernel_read_file_from_fd(), and vmap() is used to map
+ * the decompressed module to a new local buffer which represents
+ * a copy of the decompressed module passed from userspace. The buffer
+ * from kernel_read_file_from_fd() is freed right away.
+ * c) layout_and_allocate() allocates space for the final resting
+ * place where we would keep the module if it were to be processed
+ * successfully.
+ *
+ * If a failure occurs after these three different allocations only one
+ * counter will be incremented with the summation of the allocated bytes freed
+ * incurred during this failure. Likewise, if module loading failed only after
+ * step b) a separate counter is used and incremented for the bytes freed and
+ * not used during both of those allocations.
+ *
+ * Virtual memory space can be limited, for example on x86 virtual memory size
+ * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
+ * memory allocations when possible. These module debugging statistics help
+ * to evaluate how much memory is being wasted on bootup due to module loading
+ * failures.
+ *
+ * All counters are designed to be incremental. Atomic counters are used so to
+ * remain simple and avoid delays and deadlocks.
+ */
+
+/**
+ * DOC: dup_failed_modules - tracks duplicate failed modules
+ *
+ * Linked list of modules which failed to be loaded because an already existing
+ * module with the same name was already being processed or already loaded.
+ * The finit_module() system call incurs heavy virtual memory allocations. In
+ * the worst case an finit_module() system call can end up allocating virtual
+ * memory 3 times:
+ *
+ * 1) kernel_read_file_from_fd() call uses vmalloc()
+ * 2) optional module decompression uses vmap()
+ * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
+ * vmalloc to deal with ELF sections requiring special permissions
+ *
+ * In practice on a typical boot today most finit_module() calls fail due to
+ * the module with the same name already being loaded or about to be processed.
+ * All virtual memory allocated to these failed modules will be freed with
+ * no functional use.
+ *
+ * To help with this the dup_failed_modules allows us to track modules which
+ * failed to load due to the fact that a module was already loaded or being
+ * processed. There are only two points at which we can fail such calls,
+ * we list them below along with the number of virtual memory allocation
+ * calls:
+ *
+ * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
+ * layout_and_allocate().
+ * - with module decompression: 2 virtual memory allocation calls
+ * - without module decompression: 1 virtual memory allocation calls
+ * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
+ * - with module decompression 3 virtual memory allocation calls
+ * - without module decompression 2 virtual memory allocation calls
+ *
+ * We should strive to get this list to be as small as possible. If this list
+ * is not empty it is a reflection of possible work or optimizations possible
+ * either in-kernel or in userspace.
+ */
+static LIST_HEAD(dup_failed_modules);
+
+/**
+ * DOC: module statistics debugfs counters
+ *
+ * The total amount of wasted virtual memory allocation space during module
+ * loading can be computed by adding the total from the summation:
+ *
+ * * @invalid_kread_bytes +
+ * @invalid_decompress_bytes +
+ * @invalid_becoming_bytes +
+ * @invalid_mod_bytes
+ *
+ * The following debugfs counters are available to inspect module loading
+ * failures:
+ *
+ * * total_mod_size: total bytes ever used by all modules we've dealt with on
+ * this system
+ * * total_text_size: total bytes of the .text and .init.text ELF section
+ * sizes we've dealt with on this system
+ * * invalid_kread_bytes: bytes allocated and then freed on failures which
+ * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
+ * uses vmalloc(). These should typically not happen unless your system is
+ * under memory pressure.
+ * * invalid_decompress_bytes: number of bytes allocated and freed due to
+ * memory allocations in the module decompression path that use vmap().
+ * These typically should not happen unless your system is under memory
+ * pressure.
+ * * invalid_becoming_bytes: total number of bytes allocated and freed used
+ * to read the kernel module userspace wants us to read before we
+ * promote it to be processed to be added to our @modules linked list. These
+ * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
+ * call and right before we allocate the our private memory for the module
+ * which would be kept if the module is successfully loaded. The most common
+ * reason for this failure is when userspace is racing to load a module
+ * which it does not yet see loaded. The first module to succeed in
+ * add_unformed_module() will add a module to our &modules list and
+ * subsequent loads of modules with the same name will error out at the
+ * end of early_mod_check(). The check for module_patient_check_exists()
+ * at the end of early_mod_check() prevents duplicate allocations
+ * on layout_and_allocate() for modules already being processed. These
+ * duplicate failed modules are non-fatal, however they typically are
+ * indicative of userspace not seeing a module in userspace loaded yet and
+ * unnecessarily trying to load a module before the kernel even has a chance
+ * to begin to process prior requests. Although duplicate failures can be
+ * non-fatal, we should try to reduce vmalloc() pressure proactively, so
+ * ideally after boot this will be close to as 0 as possible. If module
+ * decompression was used we also add to this counter the cost of the
+ * initial kernel_read_file_from_fd() of the compressed module. If module
+ * decompression was not used the value represents the total allocated and
+ * freed bytes in kernel_read_file_from_fd() calls for these type of
+ * failures. These failures can occur because:
+ *
+ * * module_sig_check() - module signature checks
+ * * elf_validity_cache_copy() - some ELF validation issue
+ * * early_mod_check():
+ *
+ * * blacklisting
+ * * failed to rewrite section headers
+ * * version magic
+ * * live patch requirements didn't check out
+ * * the module was detected as being already present
+ *
+ * * invalid_mod_bytes: these are the total number of bytes allocated and
+ * freed due to failures after we did all the sanity checks of the module
+ * which userspace passed to us and after our first check that the module
+ * is unique. A module can still fail to load if we detect the module is
+ * loaded after we allocate space for it with layout_and_allocate(), we do
+ * this check right before processing the module as live and run its
+ * initialization routines. Note that you have a failure of this type it
+ * also means the respective kernel_read_file_from_fd() memory space was
+ * also freed and not used, and so we increment this counter with twice
+ * the size of the module. Additionally if you used module decompression
+ * the size of the compressed module is also added to this counter.
+ *
+ * * modcount: how many modules we've loaded in our kernel life time
+ * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
+ * * failed_decompress: how many failed module decompression attempts we've had.
+ * These really should not happen unless your compression / decompression
+ * might be broken.
+ * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
+ * it and before we allocate memory for it with layout_and_allocate(). This
+ * counter is never incremented if you manage to validate the module and
+ * call layout_and_allocate() for it.
+ * * failed_load_modules: how many modules failed once we've allocated our
+ * private space for our module using layout_and_allocate(). These failures
+ * should hopefully mostly be dealt with already. Races in theory could
+ * still exist here, but it would just mean the kernel had started processing
+ * two threads concurrently up to early_mod_check() and one thread won.
+ * These failures are good signs the kernel or userspace is doing something
+ * seriously stupid or that could be improved. We should strive to fix these,
+ * but it is perhaps not easy to fix them. A recent example are the modules
+ * requests incurred for frequency modules, a separate module request was
+ * being issued for each CPU on a system.
+ */
+
+atomic_long_t total_mod_size;
+atomic_long_t total_text_size;
+atomic_long_t invalid_kread_bytes;
+atomic_long_t invalid_decompress_bytes;
+static atomic_long_t invalid_becoming_bytes;
+static atomic_long_t invalid_mod_bytes;
+atomic_t modcount;
+atomic_t failed_kreads;
+atomic_t failed_decompress;
+static atomic_t failed_becoming;
+static atomic_t failed_load_modules;
+
+static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
+{
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
+ test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Becoming & Load";
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
+ return "Becoming";
+ if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Load";
+ return "Bug-on-stats";
+}
+
+void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+ atomic_long_add(info->len * 2, &invalid_mod_bytes);
+ atomic_inc(&failed_load_modules);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_mod_bytes);
+#endif
+}
+
+void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+ atomic_inc(&failed_becoming);
+ atomic_long_add(info->len, &invalid_becoming_bytes);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
+#endif
+}
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
+{
+ struct mod_fail_load *mod_fail;
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_fail->name, name)) {
+ atomic_long_inc(&mod_fail->count);
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ goto out;
+ }
+ }
+
+ mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
+ if (!mod_fail)
+ return -ENOMEM;
+ memcpy(mod_fail->name, name, strlen(name));
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ atomic_long_inc(&mod_fail->count);
+ list_add_rcu(&mod_fail->list, &dup_failed_modules);
+out:
+ return 0;
+}
+
+/*
+ * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
+ * 112 module prints within 8k.
+ *
+ * 1024 + (64*112) = 8k
+ */
+#define MAX_PREAMBLE 1024
+#define MAX_FAILED_MOD_PRINT 112
+#define MAX_BYTES_PER_MOD 64
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct mod_fail_load *mod_fail;
+ unsigned int len, size, count_failed = 0;
+ char *buf;
+ int ret;
+ u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
+ unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
+ idecompress_bytes, imod_bytes, total_virtual_lost;
+
+ live_mod_count = atomic_read(&modcount);
+ fkreads = atomic_read(&failed_kreads);
+ fdecompress = atomic_read(&failed_decompress);
+ fbecoming = atomic_read(&failed_becoming);
+ floads = atomic_read(&failed_load_modules);
+
+ total_size = atomic_long_read(&total_mod_size);
+ text_size = atomic_long_read(&total_text_size);
+ ikread_bytes = atomic_long_read(&invalid_kread_bytes);
+ idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
+ ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
+ imod_bytes = atomic_long_read(&invalid_mod_bytes);
+
+ total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
+
+ size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
+ (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* The beginning of our debug preamble */
+ len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
+ fdecompress);
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
+ idecompress_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
+
+ if (live_mod_count && total_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
+ DIV_ROUND_UP(total_size, live_mod_count));
+ }
+
+ if (live_mod_count && text_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
+ DIV_ROUND_UP(text_size, live_mod_count));
+ }
+
+ /*
+ * We use WARN_ON_ONCE() for the counters to ensure we always have parity
+ * for keeping tabs on a type of failure with one type of byte counter.
+ * The counters for imod_bytes does not increase for fkreads failures
+ * for example, and so on.
+ */
+
+ WARN_ON_ONCE(ikread_bytes && !fkreads);
+ if (fkreads && ikread_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
+ DIV_ROUND_UP(ikread_bytes, fkreads));
+ }
+
+ WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
+ if (fbecoming && ibecoming_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
+ DIV_ROUND_UP(ibecoming_bytes, fbecoming));
+ }
+
+ WARN_ON_ONCE(idecompress_bytes && !fdecompress);
+ if (fdecompress && idecompress_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
+ DIV_ROUND_UP(idecompress_bytes, fdecompress));
+ }
+
+ WARN_ON_ONCE(imod_bytes && !floads);
+ if (floads && imod_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
+ DIV_ROUND_UP(imod_bytes, floads));
+ }
+
+ /* End of our debug preamble header. */
+
+ /* Catch when we've gone beyond our expected preamble */
+ WARN_ON_ONCE(len >= MAX_PREAMBLE);
+
+ if (list_empty(&dup_failed_modules))
+ goto out;
+
+ len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
+ len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
+ "Module-name", "How-many-times", "Reason");
+ mutex_lock(&module_mutex);
+
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
+ if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
+ goto out_unlock;
+ len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
+ atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
+ }
+out_unlock:
+ mutex_unlock(&module_mutex);
+out:
+ ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
+ kfree(buf);
+ return ret;
+}
+#undef MAX_PREAMBLE
+#undef MAX_FAILED_MOD_PRINT
+#undef MAX_BYTES_PER_MOD
+
+static const struct file_operations fops_mod_stats = {
+ .read = read_file_mod_stats,
+ .open = simple_open,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
+#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
+static int __init module_stats_init(void)
+{
+ mod_debug_add_ulong(total_mod_size);
+ mod_debug_add_ulong(total_text_size);
+ mod_debug_add_ulong(invalid_kread_bytes);
+ mod_debug_add_ulong(invalid_decompress_bytes);
+ mod_debug_add_ulong(invalid_becoming_bytes);
+ mod_debug_add_ulong(invalid_mod_bytes);
+
+ mod_debug_add_atomic(modcount);
+ mod_debug_add_atomic(failed_kreads);
+ mod_debug_add_atomic(failed_decompress);
+ mod_debug_add_atomic(failed_becoming);
+ mod_debug_add_atomic(failed_load_modules);
+
+ debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+ return 0;
+}
+#undef mod_debug_add_ulong
+#undef mod_debug_add_atomic
+module_init(module_stats_init);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..a2b656b4e3d2
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include "internal.h"
+
+static void module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ set_vm_flush_reset_perms(mod_mem->base);
+ set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+void module_enable_x(const struct module *mod)
+{
+ for_class_mod_mem_type(type, text)
+ module_set_memory(mod, type, set_memory_x);
+}
+
+void module_enable_ro(const struct module *mod, bool after_init)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+#ifdef CONFIG_STRICT_MODULE_RWX
+ if (!rodata_enabled)
+ return;
+#endif
+
+ module_set_memory(mod, MOD_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+
+ if (after_init)
+ module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+}
+
+void module_enable_nx(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+
+ for_class_mod_mem_type(type, data)
+ module_set_memory(mod, type, set_memory_nx);
+}
+
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+ pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+ mod->name, secstrings + sechdrs[i].sh_name, i);
+ return -ENOEXEC;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..d964167c6658
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attr {
+ struct bin_attribute battr;
+ unsigned long address;
+};
+
+struct module_sect_attrs {
+ struct attribute_group grp;
+ unsigned int nsections;
+ struct module_sect_attr attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
+{
+ struct module_sect_attr *sattr =
+ container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ unsigned int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].battr.attr.name);
+ kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct bin_attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+ sizeof(sect_attrs->grp.bin_attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (!sect_attrs)
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+
+ sect_attrs->nsections = 0;
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.bin_attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+
+ if (sect_empty(sec))
+ continue;
+ sysfs_bin_attr_init(&sattr->battr);
+ sattr->address = sec->sh_addr;
+ sattr->battr.attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (!sattr->battr.attr.name)
+ goto out;
+ sect_attrs->nsections++;
+ sattr->battr.read = module_sect_read;
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
+ sattr->battr.attr.mode = 0400;
+ *(gattr++) = &(sattr++)->battr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+out:
+ free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct kobject *dir;
+ unsigned int notes;
+ struct bin_attribute attrs[] __counted_by(notes);
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ /*
+ * The caller checked the pos and count against our size.
+ */
+ memcpy(buf, bin_attr->private + pos, count);
+ return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ unsigned int i)
+{
+ if (notes_attrs->dir) {
+ while (i-- > 0)
+ sysfs_remove_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]);
+ kobject_put(notes_attrs->dir);
+ }
+ kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ struct bin_attribute *nattr;
+
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ info->sechdrs[i].sh_type == SHT_NOTE)
+ ++notes;
+
+ if (notes == 0)
+ return;
+
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ GFP_KERNEL);
+ if (!notes_attrs)
+ return;
+
+ notes_attrs->notes = notes;
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+ nattr->attr.mode = 0444;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *)info->sechdrs[i].sh_addr;
+ nattr->read = module_notes_read;
+ ++nattr;
+ }
+ ++loaded;
+ }
+
+ notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ if (!notes_attrs->dir)
+ goto out;
+
+ for (i = 0; i < notes; ++i)
+ if (sysfs_create_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]))
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return;
+
+out:
+ free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs)
+ free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+ int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
+#endif
+ return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (modinfo_attrs_count + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (!attr->test || attr->test(mod)) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ if (error)
+ goto error_out;
+ ++temp_attr;
+ }
+ }
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
+ return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_kset) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
+
+ return 0;
+
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod, -1);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..16742d1c630c
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root;
+
+int try_add_tainted_module(struct module *mod)
+{
+ struct mod_unload_taint *mod_taint;
+
+ module_assert_mutex_or_preempt();
+
+ if (!mod->taints)
+ goto out;
+
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_taint->name, mod->name) &&
+ mod_taint->taints & mod->taints) {
+ mod_taint->count++;
+ goto out;
+ }
+ }
+
+ mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+ if (unlikely(!mod_taint))
+ return -ENOMEM;
+ strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+ mod_taint->taints = mod->taints;
+ list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+ mod_taint->count = 1;
+out:
+ return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ if (!list_empty(&unloaded_tainted_modules)) {
+ printk(KERN_DEFAULT "Unloaded tainted modules:");
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+ list) {
+ size_t l;
+
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+ pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+ mod_taint->count);
+ }
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
+}
+
+static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
+}
+
+static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ size_t l;
+
+ mod_taint = list_entry(p, struct mod_unload_taint, list);
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+
+ seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations unloaded_tainted_modules_seq_ops = {
+ .start = unloaded_tainted_modules_seq_start,
+ .next = unloaded_tainted_modules_seq_next,
+ .stop = unloaded_tainted_modules_seq_stop,
+ .show = unloaded_tainted_modules_seq_show,
+};
+
+static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unloaded_tainted_modules_seq_ops);
+}
+
+static const struct file_operations unloaded_tainted_modules_fops = {
+ .open = unloaded_tainted_modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init unloaded_tainted_modules_init(void)
+{
+ debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
+ &unloaded_tainted_modules_fops);
+ return 0;
+}
+module_init(unloaded_tainted_modules_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..277197977d43
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ mod->mem[type].mtn.mod = mod;
+ if (mod->mem[type].size)
+ __mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+ for_class_mod_mem_type(type, init) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..53f43ac5a73e
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *)sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ crcval = *crc;
+ if (versions[i].crc == crcval)
+ return 1;
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
+ goto bad_version;
+ }
+
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+ return 1;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+ return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
+
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary -- use preempt_disable() to placate lockdep.
+ */
+ preempt_disable();
+ if (!find_symbol(&fsa)) {
+ preempt_enable();
+ BUG();
+ }
+ preempt_enable();
+ return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
index 4224a1086b7d..00132d12487c 100644
--- a/kernel/module_signature.c
+++ b/kernel/module_signature.c
@@ -25,7 +25,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len,
return -EBADMSG;
if (ms->id_type != PKEY_ID_PKCS7) {
- pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ pr_err("%s: not signed with expected PKCS#7 message\n",
name);
return -ENOPKG;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index 9d9fc678c91d..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/module_signature.h>
-#include <linux/string.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
-#include "module-internal.h"
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, struct load_info *info)
-{
- struct module_signature ms;
- size_t sig_len, modlen = info->len;
- int ret;
-
- pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
- if (modlen <= sizeof(ms))
- return -EBADMSG;
-
- memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-
- ret = mod_check_sig(&ms, modlen, info->name);
- if (ret)
- return ret;
-
- sig_len = be32_to_cpu(ms.sig_len);
- modlen -= sig_len + sizeof(ms);
- info->len = modlen;
-
- return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
-}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 84c987dfbe03..b3ce28f39eb6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -7,6 +7,9 @@
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
+
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
@@ -20,19 +23,24 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n,
+ bool unique_priority)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
- WARN(1, "double register detected");
- return 0;
+ WARN(1, "notifier callback %ps already registered",
+ n->notifier_call);
+ return -EEXIST;
}
if (n->priority > (*nl)->priority)
break;
+ if (n->priority == (*nl)->priority && unique_priority)
+ return -EBUSY;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
+ trace_notifier_register((void *)n->notifier_call);
return 0;
}
@@ -42,6 +50,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
+ trace_notifier_unregister((void *)n->notifier_call);
return 0;
}
nl = &((*nl)->next);
@@ -58,7 +67,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -80,6 +89,7 @@ static int notifier_call_chain(struct notifier_block **nl,
continue;
}
#endif
+ trace_notifier_run((void *)nb->notifier_call);
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
@@ -94,6 +104,34 @@ static int notifier_call_chain(struct notifier_block **nl,
}
NOKPROBE_SYMBOL(notifier_call_chain);
+/**
+ * notifier_call_chain_robust - Inform the registered notifiers about an event
+ * and rollback on error.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val_up: Value passed unmodified to the notifier function
+ * @val_down: Value passed unmodified to the notifier function when recovering
+ * from an error on @val_up
+ * @v: Pointer passed unmodified to the notifier function
+ *
+ * NOTE: It is important the @nl chain doesn't change between the two
+ * invocations of notifier_call_chain() such that we visit the
+ * exact same notifier callbacks; this rules out any RCU usage.
+ *
+ * Return: the return value of the @val_up call.
+ */
+static int notifier_call_chain_robust(struct notifier_block **nl,
+ unsigned long val_up, unsigned long val_down,
+ void *v)
+{
+ int ret, nr = 0;
+
+ ret = notifier_call_chain(nl, val_up, v, -1, &nr);
+ if (ret & NOTIFY_STOP_MASK)
+ notifier_call_chain(nl, val_down, v, nr-1, NULL);
+
+ return ret;
+}
+
/*
* Atomic notifier chain routines. Registration and unregistration
* use a spinlock, and call_chain is synchronized by RCU (no locks).
@@ -106,7 +144,7 @@ NOKPROBE_SYMBOL(notifier_call_chain);
*
* Adds a notifier to an atomic notifier chain.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
@@ -115,13 +153,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
+ * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n, true);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
+
+/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
@@ -145,12 +206,10 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
- * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See the comment for notifier_call_chain.
- * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -163,45 +222,41 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
rcu_read_unlock();
+
return ret;
}
-EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+/**
+ * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ * @nh: Pointer to head of the atomic notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
- return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+ return !rcu_access_pointer(nh->head);
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
-/**
- * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain.
- * Must be called in process context.
- *
- * Currently always returns zero.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n,
+ bool unique_priority)
{
int ret;
@@ -211,16 +266,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, unique_priority);
down_write(&nh->rwsem);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, unique_priority);
up_write(&nh->rwsem);
return ret;
}
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, false);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
+ * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an blocking notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, true);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
+
+/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
@@ -250,13 +338,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ int ret = NOTIFY_DONE;
+
+ /*
+ * We check the head outside the lock, but if this access is
+ * racy then it does not matter what the result of the test
+ * is, we re-check the list after having taken the lock anyway:
+ */
+ if (rcu_access_pointer(nh->head)) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+ up_read(&nh->rwsem);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
+
/**
- * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -268,9 +373,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
@@ -281,19 +385,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
*/
if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
- nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
up_read(&nh->rwsem);
}
return ret;
}
-EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
-
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -309,12 +405,12 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
@@ -335,13 +431,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
+
/**
- * __raw_notifier_call_chain - Call functions in a raw notifier chain
+ * raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -354,22 +455,13 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __raw_notifier_call_chain(struct raw_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
-{
- return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
-}
-EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
-
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+ return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -383,7 +475,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
@@ -396,10 +488,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
mutex_lock(&nh->mutex);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
mutex_unlock(&nh->mutex);
return ret;
}
@@ -437,12 +529,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -454,25 +544,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
-EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
-
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
@@ -496,8 +578,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cd356630a311..15781acaac1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -30,7 +30,7 @@
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &init_ipc_ns,
@@ -55,7 +55,7 @@ static inline struct nsproxy *create_nsproxy(void)
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
- atomic_set(&nsproxy->count, 1);
+ refcount_set(&nsproxy->count, 1);
return nsproxy;
}
@@ -153,12 +153,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
- int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
- if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
return 0;
}
@@ -173,18 +173,15 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
* it along with CLONE_NEWIPC.
*/
if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
- (CLONE_NEWIPC | CLONE_SYSVSEM))
+ (CLONE_NEWIPC | CLONE_SYSVSEM))
return -EINVAL;
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- ret = timens_on_fork(new_ns, tsk);
- if (ret) {
- free_nsproxy(new_ns);
- return ret;
- }
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
@@ -250,8 +247,8 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
p->nsproxy = new;
task_unlock(p);
- if (ns && atomic_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ if (ns)
+ put_nsproxy(ns);
}
void exit_task_namespaces(struct task_struct *p)
@@ -259,11 +256,28 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+int exec_task_namespaces(void)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new;
+
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
+
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
- CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
+ CLONE_NEWPID | CLONE_NEWCGROUP)))
return -EINVAL;
#ifndef CONFIG_USER_NS
@@ -290,6 +304,10 @@ static int check_setns_flags(unsigned long flags)
if (flags & CLONE_NEWNET)
return -EINVAL;
#endif
+#ifndef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ return -EINVAL;
+#endif
return 0;
}
@@ -464,6 +482,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
}
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME) {
+ ret = validate_ns(nsset, &nsp->time_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
out:
if (pid_ns)
put_pid_ns(pid_ns);
@@ -507,6 +533,11 @@ static void commit_nsset(struct nsset *nsset)
exit_sem(me);
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ timens_commit(me, nsset->nsproxy->time_ns);
+#endif
+
/* transfer ownership */
switch_task_namespaces(me, nsset->nsproxy);
nsset->nsproxy = NULL;
@@ -514,21 +545,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ struct fd f = fdget(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
+ if (proc_ns_file(f.file)) {
+ ns = get_proc_ns(file_inode(f.file));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ } else if (!IS_ERR(pidfd_pid(f.file))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -540,22 +570,22 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, f.file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
+ fdput(f);
return err;
}
int __init nsproxy_cache_init(void)
{
- nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
return 0;
}
diff --git a/kernel/numa.c b/kernel/numa.c
new file mode 100644
index 000000000000..67ca6b8585c0
--- /dev/null
+++ b/kernel/numa.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/printk.h>
+#include <linux/numa.h>
+
+/* Stub functions: */
+
+#ifndef memory_add_physaddr_to_nid
+int memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#ifndef phys_to_target_node
+int phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+#endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 4373f7adaa40..179fb1518070 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -9,19 +9,6 @@
*
* Copyright (c) 2020 Oracle and/or its affiliates.
* Author: Daniel Jordan <daniel.m.jordan@oracle.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <linux/completion.h>
@@ -96,8 +83,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -194,7 +189,7 @@ int padata_do_parallel(struct padata_shell *ps,
goto out;
if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
- if (!cpumask_weight(pd->cpumask.cbcpu))
+ if (cpumask_empty(pd->cpumask.cbcpu))
goto out;
/* Select an alternate fallback CPU and notify the caller. */
@@ -207,26 +202,29 @@ int padata_do_parallel(struct padata_shell *ps,
*cb_cpu = cpu;
}
- err = -EBUSY;
+ err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
goto out;
- atomic_inc(&pd->refcnt);
+ refcount_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
- rcu_read_unlock_bh();
-
spin_lock(&padata_works_lock);
padata->seq_nr = ++pd->seq_nr;
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
+ rcu_read_unlock_bh();
+
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -250,13 +248,11 @@ EXPORT_SYMBOL(padata_do_parallel);
static struct padata_priv *padata_find_next(struct parallel_data *pd,
bool remove_object)
{
- struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
int cpu = pd->cpu;
- next_queue = per_cpu_ptr(pd->pqueue, cpu);
- reorder = &next_queue->reorder;
+ reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
if (list_empty(&reorder->list)) {
@@ -291,7 +287,7 @@ static void padata_reorder(struct parallel_data *pd)
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
- struct padata_parallel_queue *next_queue;
+ struct padata_list *reorder;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -339,9 +335,8 @@ static void padata_reorder(struct parallel_data *pd)
*/
smp_mb();
- next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
- if (!list_empty(&next_queue->reorder.list) &&
- padata_find_next(pd, false))
+ reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
+ if (!list_empty(&reorder->list) && padata_find_next(pd, false))
queue_work(pinst->serial_wq, &pd->reorder_work);
}
@@ -385,7 +380,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (atomic_sub_and_test(cnt, &pd->refcnt))
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
padata_free_pd(pd);
}
@@ -401,17 +396,19 @@ void padata_do_serial(struct padata_priv *padata)
{
struct parallel_data *pd = padata->pd;
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
- struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
- hashed_cpu);
+ struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
- spin_lock(&pqueue->reorder.lock);
+ spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &pqueue->reorder.list, list)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
if (cur->seq_nr < padata->seq_nr)
break;
- list_add(&padata->list, &cur->list);
- spin_unlock(&pqueue->reorder.lock);
+ }
+ list_add(&padata->list, pos);
+ spin_unlock(&reorder->lock);
/*
* Ensure the addition to the reorder list is ordered correctly
@@ -441,28 +438,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
return err;
}
-static int pd_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
-{
- int err = -ENOMEM;
-
- if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
- goto free_pcpu_mask;
-
- cpumask_copy(pd->cpumask.pcpu, pcpumask);
- cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
-
- return 0;
-
-free_pcpu_mask:
- free_cpumask_var(pd->cpumask.pcpu);
-out:
- return err;
-}
-
static void __init padata_mt_helper(struct work_struct *w)
{
struct padata_work *pw = container_of(w, struct padata_work, pw_work);
@@ -516,7 +491,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
return;
/* Ensure at least one thread when size < min_chunk. */
- nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
nworks = min(nworks, job->max_threads);
if (nworks == 1) {
@@ -575,17 +550,15 @@ static void padata_init_squeues(struct parallel_data *pd)
}
}
-/* Initialize all percpu queues used by parallel workers */
-static void padata_init_pqueues(struct parallel_data *pd)
+/* Initialize per-CPU reorder lists */
+static void padata_init_reorder_list(struct parallel_data *pd)
{
int cpu;
- struct padata_parallel_queue *pqueue;
+ struct padata_list *list;
for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
-
- __padata_list_init(&pqueue->reorder);
- atomic_set(&pqueue->num_obj, 0);
+ list = per_cpu_ptr(pd->reorder_list, cpu);
+ __padata_list_init(list);
}
}
@@ -593,43 +566,46 @@ static void padata_init_pqueues(struct parallel_data *pd)
static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
struct padata_instance *pinst = ps->pinst;
- const struct cpumask *cbcpumask;
- const struct cpumask *pcpumask;
struct parallel_data *pd;
- cbcpumask = pinst->rcpumask.cbcpu;
- pcpumask = pinst->rcpumask.pcpu;
-
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
- pd->pqueue = alloc_percpu(struct padata_parallel_queue);
- if (!pd->pqueue)
+ pd->reorder_list = alloc_percpu(struct padata_list);
+ if (!pd->reorder_list)
goto err_free_pd;
pd->squeue = alloc_percpu(struct padata_serial_queue);
if (!pd->squeue)
- goto err_free_pqueue;
+ goto err_free_reorder_list;
pd->ps = ps;
- if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
+
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto err_free_squeue;
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+ goto err_free_pcpu;
+
+ cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
+ cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
- padata_init_pqueues(pd);
+ padata_init_reorder_list(pd);
padata_init_squeues(pd);
pd->seq_nr = -1;
- atomic_set(&pd->refcnt, 1);
+ refcount_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
+err_free_pcpu:
+ free_cpumask_var(pd->cpumask.pcpu);
err_free_squeue:
free_percpu(pd->squeue);
-err_free_pqueue:
- free_percpu(pd->pqueue);
+err_free_reorder_list:
+ free_percpu(pd->reorder_list);
err_free_pd:
kfree(pd);
err:
@@ -640,7 +616,7 @@ static void padata_free_pd(struct parallel_data *pd)
{
free_cpumask_var(pd->cpumask.pcpu);
free_cpumask_var(pd->cpumask.cbcpu);
- free_percpu(pd->pqueue);
+ free_percpu(pd->reorder_list);
free_percpu(pd->squeue);
kfree(pd);
}
@@ -682,12 +658,6 @@ static int padata_replace(struct padata_instance *pinst)
pinst->flags |= PADATA_RESET;
- cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
- cpu_online_mask);
-
- cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
- cpu_online_mask);
-
list_for_each_entry(ps, &pinst->pslist, list) {
err = padata_replace_one(ps);
if (err)
@@ -697,7 +667,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (atomic_dec_and_test(&ps->opd->refcnt))
+ if (refcount_dec_and_test(&ps->opd->refcnt))
padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -763,7 +733,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&pinst->lock);
switch (cpumask_type) {
@@ -783,49 +753,12 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
out:
mutex_unlock(&pinst->lock);
- put_online_cpus();
+ cpus_read_unlock();
return err;
}
EXPORT_SYMBOL(padata_set_cpumask);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- *
- * Return: 0 on success or negative error code
- */
-int padata_start(struct padata_instance *pinst)
-{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err = -EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
- mutex_lock(&pinst->lock);
- __padata_stop(pinst);
- mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
#ifdef CONFIG_HOTPLUG_CPU
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
@@ -907,9 +840,6 @@ static void __padata_free(struct padata_instance *pinst)
WARN_ON(!list_empty(&pinst->pslist));
- padata_stop(pinst);
- free_cpumask_var(pinst->rcpumask.cbcpu);
- free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
destroy_workqueue(pinst->serial_wq);
@@ -1037,25 +967,19 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
.default_groups = padata_default_groups,
.release = padata_sysfs_release,
};
/**
- * padata_alloc - allocate and initialize a padata instance and specify
- * cpumasks for serial and parallel workers.
- *
+ * padata_alloc - allocate and initialize a padata instance
* @name: used to identify the instance
- * @pcpumask: cpumask that will be used for padata parallelization
- * @cbcpumask: cpumask that will be used for padata serialization
*
* Return: new instance on success, NULL on error
*/
-static struct padata_instance *padata_alloc(const char *name,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+struct padata_instance *padata_alloc(const char *name)
{
struct padata_instance *pinst;
@@ -1068,7 +992,7 @@ static struct padata_instance *padata_alloc(const char *name,
if (!pinst->parallel_wq)
goto err_free_inst;
- get_online_cpus();
+ cpus_read_lock();
pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
WQ_CPU_INTENSIVE, 1, name);
@@ -1081,26 +1005,16 @@ static struct padata_instance *padata_alloc(const char *name,
free_cpumask_var(pinst->cpumask.pcpu);
goto err_free_serial_wq;
}
- if (!padata_validate_cpumask(pinst, pcpumask) ||
- !padata_validate_cpumask(pinst, cbcpumask))
- goto err_free_masks;
-
- if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
- goto err_free_masks;
- if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
- goto err_free_rcpumask_pcpu;
INIT_LIST_HEAD(&pinst->pslist);
- cpumask_copy(pinst->cpumask.pcpu, pcpumask);
- cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
- cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
+ cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask);
+ cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask);
if (padata_setup_cpumasks(pinst))
- goto err_free_rcpumask_cbcpu;
+ goto err_free_masks;
- pinst->flags = 0;
+ __padata_start(pinst);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
@@ -1112,42 +1026,24 @@ static struct padata_instance *padata_alloc(const char *name,
&pinst->cpu_dead_node);
#endif
- put_online_cpus();
+ cpus_read_unlock();
return pinst;
-err_free_rcpumask_cbcpu:
- free_cpumask_var(pinst->rcpumask.cbcpu);
-err_free_rcpumask_pcpu:
- free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
err_free_serial_wq:
destroy_workqueue(pinst->serial_wq);
err_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
err:
return NULL;
}
-
-/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- * Use the cpu_possible_mask for serial and
- * parallel workers.
- *
- * @name: used to identify the instance
- *
- * Return: new instance on success, NULL on error
- */
-struct padata_instance *padata_alloc_possible(const char *name)
-{
- return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
+EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
@@ -1178,9 +1074,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
ps->pinst = pinst;
- get_online_cpus();
+ cpus_read_lock();
pd = padata_alloc_pd(ps);
- put_online_cpus();
+ cpus_read_unlock();
if (!pd)
goto out_free_ps;
@@ -1206,12 +1102,16 @@ EXPORT_SYMBOL(padata_alloc_shell);
*/
void padata_free_shell(struct padata_shell *ps)
{
+ struct parallel_data *pd;
+
if (!ps)
return;
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
- padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ pd = rcu_dereference_protected(ps->pd, 1);
+ if (refcount_dec_and_test(&pd->refcnt))
+ padata_free_pd(pd);
mutex_unlock(&ps->pinst->lock);
kfree(ps);
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..2807639aab51 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,9 @@
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
+#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -31,6 +33,9 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
+#include <trace/events/error_report.h>
#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
@@ -41,12 +46,14 @@
* Should we dump all CPUs backtraces in an oops event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask =
- IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -54,6 +61,7 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -64,12 +72,63 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
+#define PANIC_PRINT_ALL_CPU_BT 0x00000040
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -82,7 +141,7 @@ EXPORT_SYMBOL(panic_blink);
/*
* Stop ourself in panic -- architecture code may override this
*/
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
{
while (1)
cpu_relax();
@@ -92,7 +151,7 @@ void __weak panic_smp_self_stop(void)
* Stop ourselves in NMI context if another CPU has already panicked. Arch code
* may override this to prepare for crash dumping, e.g. save regs info.
*/
-void __weak nmi_panic_self_stop(struct pt_regs *regs)
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
panic_smp_self_stop();
}
@@ -133,28 +192,32 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, cpu;
+ int old_cpu, this_cpu;
- cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
- if (old_cpu == PANIC_CPU_INVALID)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
panic("%s", msg);
- else if (old_cpu != cpu)
+ else if (old_cpu != this_cpu)
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(void)
+static void panic_print_sys_info(bool console_flush)
{
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ if (console_flush) {
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ return;
+ }
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem(0, NULL);
+ show_mem();
if (panic_print & PANIC_PRINT_TIMER_INFO)
sysrq_timer_list_show();
@@ -166,6 +229,43 @@ static void panic_print_sys_info(void)
ftrace_dump(DUMP_ALL);
}
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
+/*
+ * Helper that triggers the NMI backtrace (if set in panic_print)
+ * and then performs the secondary CPUs shutdown - we cannot have
+ * the NMI backtrace after the CPUs are off!
+ */
+static void panic_other_cpus_shutdown(bool crash_kexec)
+{
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
+
+ /*
+ * Note that smp_send_stop() is the usual SMP shutdown function,
+ * which unfortunately may not be hardened to work in a panic
+ * situation. If we want to do crash dump after notifier calls
+ * and kmsg_dump, we will need architecture dependent extra
+ * bits in addition to stopping other CPUs, hence we rely on
+ * crash_smp_send_stop() for that.
+ */
+ if (!crash_kexec)
+ smp_send_stop();
+ else
+ crash_smp_send_stop();
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -183,6 +283,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -202,15 +312,18 @@ void panic(const char *fmt, ...)
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
- * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
- * comes here, so go ahead.
+ * cmpxchg success means this is the 1st CPU which comes here,
+ * so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ /* go ahead */
+ } else if (old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
@@ -246,24 +359,10 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
+ if (!_crash_kexec_post_notifiers)
__crash_kexec(NULL);
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a
- * panic situation.
- */
- smp_send_stop();
- } else {
- /*
- * If we want to do crash dump after notifier calls and
- * kmsg_dump, we will need architecture dependent extra
- * works in addition to stopping other CPUs.
- */
- crash_smp_send_stop();
- }
+ panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
/*
* Run any panic handlers, including those that might need to
@@ -271,8 +370,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
+ panic_print_sys_info(false);
+
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -287,9 +386,6 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
-#ifdef CONFIG_VT
- unblank_screen();
-#endif
console_unblank();
/*
@@ -303,7 +399,7 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info();
+ panic_print_sys_info(true);
if (!panic_blink)
panic_blink = no_blink;
@@ -386,6 +482,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
[ TAINT_LIVEPATCH ] = { 'K', ' ', true },
[ TAINT_AUX ] = { 'X', ' ', true },
[ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
+ [ TAINT_TEST ] = { 'N', ' ', true },
};
/**
@@ -505,7 +602,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -535,26 +632,9 @@ void oops_enter(void)
trigger_all_cpu_backtrace();
}
-/*
- * 64-bit random ID for oopses:
- */
-static u64 oops_id;
-
-static int init_oops_id(void)
-{
- if (!oops_id)
- get_random_bytes(&oops_id, sizeof(oops_id));
- else
- oops_id++;
-
- return 0;
-}
-late_initcall(init_oops_id);
-
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
- init_oops_id();
- pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", 0ULL);
}
/*
@@ -589,36 +669,31 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (args)
vprintk(args->fmt, args->args);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
- }
-
print_modules();
if (regs)
show_regs(regs);
- else
+
+ check_panic_on_warn("kernel");
+
+ if (!regs)
dump_stack();
print_irqtrace_events(current);
print_oops_end_marker();
+ trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller);
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
}
+#ifdef CONFIG_BUG
#ifndef __WARN_FLAGS
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -626,6 +701,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
if (!fmt) {
__warn(file, line, __builtin_return_address(0), taint,
NULL, NULL);
+ warn_rcu_exit(rcu);
return;
}
@@ -633,11 +709,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -645,12 +723,11 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
-#ifdef CONFIG_BUG
-
/* Support resetting WARN*_ONCE state */
static int clear_warn_once_set(void *data, u64 val)
@@ -727,8 +804,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 8e56f8b12d8f..2e447f8ae183 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* Helpers for initial module or kernel cmdline parsing
- Copyright (C) 2001 Rusty Russell.
-
-*/
-#include <linux/kernel.h>
-#include <linux/string.h>
+/*
+ * Helpers for initial module or kernel cmdline parsing
+ * Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
+#include <linux/overflow.h>
#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/string.h>
#ifdef CONFIG_SYSFS
/* Protects all built-in parameters, modules use their own param_lock */
@@ -47,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
{
struct kmalloced_param *p;
- p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
@@ -119,9 +121,7 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
- void *arg,
- int (*handle_unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn handle_unknown)
{
unsigned int i;
int err;
@@ -164,9 +164,7 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- void *arg,
- int (*unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn unknown)
{
char *param, *val, *err = NULL;
@@ -233,28 +231,52 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
+
+int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned int num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtouint(val, 0, &num);
+ if (ret)
+ return ret;
+ if (num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(param_set_uint_minmax);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
- if (strlen(val) > 1024) {
+ size_t len, maxlen = 1024;
+
+ len = strnlen(val, maxlen + 1);
+ if (len == maxlen + 1) {
pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
maybe_kfree_parameter(*(char **)kp->arg);
- /* This is a hack. We can't kmalloc in early boot, and we
- * don't need to; this mangled commandline is preserved. */
+ /*
+ * This is a hack. We can't kmalloc() in early boot, and we
+ * don't need to; this mangled commandline is preserved.
+ */
if (slab_is_available()) {
- *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+ *(char **)kp->arg = kmalloc_parameter(len + 1);
if (!*(char **)kp->arg)
return -ENOMEM;
strcpy(*(char **)kp->arg, val);
@@ -291,7 +313,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
@@ -311,7 +333,7 @@ EXPORT_SYMBOL(param_ops_bool);
int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
{
- int err = 0;
+ int err;
bool new_value;
bool orig_value = *(bool *)kp->arg;
struct kernel_param dummy_kp = *kp;
@@ -492,7 +514,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- if (strlen(val)+1 > kps->maxlen) {
+ if (strnlen(val, kps->maxlen) == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
@@ -529,7 +551,7 @@ struct module_param_attrs
{
unsigned int num;
struct attribute_group grp;
- struct param_attribute attrs[0];
+ struct param_attribute attrs[];
};
#ifdef CONFIG_SYSFS
@@ -723,8 +745,10 @@ void module_param_sysfs_remove(struct module *mod)
{
if (mod->mkobj.mp) {
sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
- /* We are positive that no one is using any param
- * attrs at this point. Deallocate immediately. */
+ /*
+ * We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately.
+ */
free_module_param_attrs(&mod->mkobj);
}
}
@@ -827,7 +851,7 @@ static void __init param_sysfs_builtin(void)
name_len = 0;
} else {
name_len = dot - kp->name + 1;
- strlcpy(modname, kp->name, name_len);
+ strscpy(modname, kp->name, name_len);
}
kernel_add_sysfs_param(modname, kp, name_len);
}
@@ -842,18 +866,16 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
-extern const struct module_version_attribute *__start___modver[];
-extern const struct module_version_attribute *__stop___modver[];
+extern const struct module_version_attribute __start___modver[];
+extern const struct module_version_attribute __stop___modver[];
static void __init version_sysfs_builtin(void)
{
- const struct module_version_attribute **p;
+ const struct module_version_attribute *vattr;
struct module_kobject *mk;
int err;
- for (p = __start___modver; p < __stop___modver; p++) {
- const struct module_version_attribute *vattr = *p;
-
+ for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
mk = locate_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
@@ -909,9 +931,9 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
- struct kobj_type *ktype = get_ktype(kobj);
+ const struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
@@ -923,7 +945,6 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
@@ -931,13 +952,17 @@ static void module_kobj_release(struct kobject *kobj)
complete(mk->kobj_completion);
}
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
.release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -947,13 +972,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
index f1496b757162..b52b10865454 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,8 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -71,7 +73,7 @@ int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .kref = KREF_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
@@ -81,6 +83,9 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -198,7 +203,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
- if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
}
@@ -517,34 +522,94 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
+EXPORT_SYMBOL_GPL(find_ge_pid);
+
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid)) {
+ get_pid(pid);
+ *flags = f.file->f_flags;
+ }
+
+ fdput(f);
+ return pid;
+}
+
+/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Currently, the process identified by @pidfd is always a thread-group leader.
+ * This restriction currently exists for all aspects of pidfds including pidfd
+ * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
+ * (only supports thread group leaders).
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ * On error, a negative errno number will be returned.
+ */
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+ unsigned int f_flags;
+ struct pid *pid;
+ struct task_struct *task;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ put_pid(pid);
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ *flags = f_flags;
+ return task;
+}
/**
* pidfd_create() - Create a new pid file descriptor.
*
- * @pid: struct pid that the pidfd will reference
+ * @pid: struct pid that the pidfd will reference
+ * @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
*
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
+ * This symbol should not be explicitly exported to loadable modules.
+ *
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid)
+int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
+ int pidfd;
+ struct file *pidfd_file;
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
- * pidfd_open() - Open new pid file descriptor.
+ * sys_pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass
@@ -564,7 +629,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags)
+ if (flags & ~PIDFD_NONBLOCK)
return -EINVAL;
if (pid <= 0)
@@ -574,10 +639,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- if (pid_has_task(p, PIDTYPE_TGID))
- fd = pidfd_create(p);
- else
- fd = -EINVAL;
+ fd = pidfd_create(p, flags);
put_pid(p);
return fd;
@@ -597,8 +659,11 @@ void __init pid_idr_init(void)
idr_init(&init_pid_ns.idr);
- init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+ init_pid_ns.pid_cachep = kmem_cache_create("pid",
+ struct_size_t(struct pid, numbers, 1),
+ __alignof__(struct pid),
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
@@ -606,7 +671,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
struct file *file;
int ret;
- ret = mutex_lock_killable(&task->signal->exec_update_mutex);
+ ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ERR_PTR(ret);
@@ -615,7 +680,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
else
file = ERR_PTR(-EPERM);
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return file ?: ERR_PTR(-EBADF);
}
@@ -635,17 +700,8 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = security_file_receive(file);
- if (ret) {
- fput(file);
- return ret;
- }
-
- ret = get_unused_fd_flags(O_CLOEXEC);
- if (ret < 0)
- fput(file);
- else
- fd_install(ret, file);
+ ret = receive_fd(file, NULL, O_CLOEXEC);
+ fput(file);
return ret;
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0e5ac162c3a8..7ade20e95232 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,8 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <uapi/linux/wait.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -47,11 +49,12 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
return kc;
snprintf(name, sizeof(name), "pid_%u", level + 1);
- len = sizeof(struct pid) + level * sizeof(struct upid);
+ len = struct_size_t(struct pid, numbers, level + 1);
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
@@ -102,13 +105,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
- kref_init(&ns->kref);
+ refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
-
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
return ns;
out_free_idr:
@@ -148,22 +153,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
return create_pid_namespace(user_ns, old_ns);
}
-static void free_pid_ns(struct kref *kref)
-{
- struct pid_namespace *ns;
-
- ns = container_of(kref, struct pid_namespace, kref);
- destroy_pid_namespace(ns);
-}
-
void put_pid_ns(struct pid_namespace *ns)
{
struct pid_namespace *parent;
while (ns != &init_pid_ns) {
parent = ns->parent;
- if (!kref_put(&ns->kref, free_pid_ns))
+ if (!refcount_dec_and_test(&ns->ns.count))
break;
+ destroy_pid_namespace(ns);
ns = parent;
}
}
@@ -233,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* to pid_ns->child_reaper. Thus pidns->child_reaper needs to
* stay valid until they all go away.
*
- * The code relies on the the pid_ns->child_reaper ignoring
+ * The code relies on the pid_ns->child_reaper ignoring
* SIGCHILD to cause those EXIT_ZOMBIE processes to be
* autoreaped if reparented.
*
@@ -250,7 +248,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
+ /*
+ * Release tasks_rcu_exit_srcu to avoid following deadlock:
+ *
+ * 1) TASK A unshare(CLONE_NEWPID)
+ * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
+ * and TASK C
+ * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
+ * 4) TASK A calls synchronize_rcu_tasks()
+ * -> synchronize_srcu(tasks_rcu_exit_srcu)
+ * 5) *DEADLOCK*
+ *
+ * It is considered safe to release tasks_rcu_exit_srcu here
+ * because we assume the current task can not be concurrently
+ * reaped at this point.
+ */
+ exit_tasks_rcu_stop();
schedule();
+ exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
@@ -269,15 +284,9 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
struct ctl_table tmp = *table;
int ret, next;
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
- /*
- * Writing directly to ns' last_pid field is OK, since this field
- * is volatile in a living namespace anyway and a code writing to
- * it should synchronize its usage with external means.
- */
-
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
@@ -300,7 +309,6 @@ static struct ctl_table pid_ns_ctl_table[] = {
},
{ }
};
-static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
@@ -456,11 +464,13 @@ const struct proc_ns_operations pidns_for_children_operations = {
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
- register_sysctl_paths(kern_path, pid_ns_ctl_table);
+ register_sysctl_init("kernel", pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..2ee41a3a1dfd
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+ int err, scope, parent_scope;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
+
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
+}
+
+static struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl("vm", pid_ns_ctl_table_vm);
+}
+#else
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a7320f07689d..4b31629c5be4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -98,27 +98,26 @@ config PM_STD_PARTITION
default ""
help
The default resume partition is the partition that the suspend-
- to-disk implementation will look for a suspended disk image.
+ to-disk implementation will look for a suspended disk image.
- The partition specified here will be different for almost every user.
+ The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
- on before suspending.
+ on before suspending.
The partition specified can be overridden by specifying:
- resume=/dev/<other device>
+ resume=/dev/<other device>
- which will set the resume partition to the device specified.
+ which will set the resume partition to the device specified.
Note there is currently not a way to specify which device to save the
- suspended image to. It will simply pick the first available swap
+ suspended image to. It will simply pick the first available swap
device.
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
@@ -139,15 +138,33 @@ config PM_SLEEP_SMP_NONZERO_CPU
config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
- default n
help
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
+config PM_USERSPACE_AUTOSLEEP
+ bool "Userspace opportunistic sleep"
+ depends on PM_SLEEP
+ help
+ Notify kernel of aggressive userspace autosleep power management policy.
+
+ This option changes the behavior of various sleep-sensitive code to deal
+ with frequent userspace-initiated transitions into a global sleep state.
+
+ Saying Y here, disables code paths that most users really should keep
+ enabled. In particular, only enable this if it is very common to be
+ asleep/awake for very short periods of time (<= 2 seconds).
+
+ Only platforms, such as Android, that implement opportunistic sleep from
+ a userspace power manager service should enable this option; and not
+ other machines. Therefore, you should say N here, unless you are
+ extremely certain that this is what you want. The option otherwise has
+ bad, undesirable effects, and should not be enabled just for fun.
+
+
config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
- default n
help
Allow user space to create, activate and deactivate wakeup source
objects with the help of a sysfs-based interface.
@@ -293,7 +310,6 @@ config PM_GENERIC_DOMAINS
config WQ_POWER_EFFICIENT_DEFAULT
bool "Enable workqueue power-efficient mode by default"
depends on PM
- default n
help
Per-cpu workqueues are generally preferred because they show
better performance thanks to cache locality; unfortunately,
@@ -322,15 +338,14 @@ config CPU_PM
bool
config ENERGY_MODEL
- bool "Energy Model for CPUs"
+ bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
depends on SMP
depends on CPU_FREQ
- default n
help
Several subsystems (thermal and/or the task scheduler for example)
- can leverage information about the energy consumed by CPUs to make
- smarter decisions. This config option enables the framework from
- which subsystems can access the energy models.
+ can leverage information about the energy consumed by devices to
+ make smarter decisions. This config option enables the framework
+ from which subsystems can access the energy models.
The exact usage of the energy model is subsystem-dependent.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..874ad834dc8d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
KASAN_SANITIZE_snapshot.o := n
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9af5a50d3489..b29c8aca7486 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work)
goto out;
/*
- * If the wakeup occured for an unknown reason, wait to prevent the
+ * If the wakeup occurred for an unknown reason, wait to prevent the
* system from trying to suspend and waking up in a tight loop.
*/
if (final_count == initial_count)
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0a9326f5f421..7b44f5b89fa1 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -1,44 +1,49 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Energy Model of CPUs
+ * Energy Model of devices
*
- * Copyright (c) 2018, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
+ * Improvements provided by: Lukasz Luba, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
-/* Mapping of each CPU to the performance domain to which it belongs. */
-static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
-
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
+static bool _is_cpu_device(struct device *dev)
+{
+ return (dev->bus == &cpu_subsys);
+}
+
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
{
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+ snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
- /* Create per-cs directory */
+ /* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
- debugfs_create_ulong("power", 0444, d, &cs->power);
- debugfs_create_ulong("cost", 0444, d, &cs->cost);
+ debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
+ debugfs_create_ulong("power", 0444, d, &ps->power);
+ debugfs_create_ulong("cost", 0444, d, &ps->cost);
+ debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -49,22 +54,40 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+
+ seq_printf(s, "%#lx\n", pd->flags);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
+
+static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
- char name[8];
int i;
- snprintf(name, sizeof(name), "pd%d", cpu);
-
/* Create the directory of the performance domain */
- d = debugfs_create_dir(name, rootdir);
+ d = debugfs_create_dir(dev_name(dev), rootdir);
- debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+ if (_is_cpu_device(dev))
+ debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
+ &em_debug_cpus_fops);
+
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
+
+ /* Create a sub-directory for each performance state */
+ for (i = 0; i < dev->em_pd->nr_perf_states; i++)
+ em_debug_create_ps(&dev->em_pd->table[i], d);
- /* Create a sub-directory for each capacity state */
- for (i = 0; i < pd->nr_cap_states; i++)
- em_debug_create_cs(&pd->table[i], d);
+}
+
+static void em_debug_remove_pd(struct device *dev)
+{
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
static int __init em_debug_init(void)
@@ -74,136 +97,252 @@ static int __init em_debug_init(void)
return 0;
}
-core_initcall(em_debug_init);
+fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+static void em_debug_create_pd(struct device *dev) {}
+static void em_debug_remove_pd(struct device *dev) {}
#endif
-static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
- struct em_data_callback *cb)
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+ int nr_states, struct em_data_callback *cb,
+ unsigned long flags)
{
- unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
- unsigned long power, freq, prev_freq = 0;
- int i, ret, cpu = cpumask_first(span);
- struct em_cap_state *table;
- struct em_perf_domain *pd;
+ unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
+ struct em_perf_state *table;
+ int i, ret;
u64 fmax;
- if (!cb->active_power)
- return NULL;
-
- pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
- if (!pd)
- return NULL;
-
table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
- goto free_pd;
+ return -ENOMEM;
- /* Build the list of capacity states for this performance domain */
+ /* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
- * lowest capacity state of 'cpu' above 'freq' and updates
+ * lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, cpu);
+ ret = cb->active_power(dev, &power, &freq);
if (ret) {
- pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
- goto free_cs_table;
+ dev_err(dev, "EM: invalid perf. state: %d\n",
+ ret);
+ goto free_ps_table;
}
/*
* We expect the driver callback to increase the frequency for
- * higher capacity states.
+ * higher performance states.
*/
if (freq <= prev_freq) {
- pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
- goto free_cs_table;
+ dev_err(dev, "EM: non-increasing freq: %lu\n",
+ freq);
+ goto free_ps_table;
}
/*
* The power returned by active_state() is expected to be
- * positive, in milli-watts and to fit into 16 bits.
+ * positive and be in range.
*/
- if (!power || power > EM_CPU_MAX_POWER) {
- pr_err("pd%d: invalid power: %lu\n", cpu, power);
- goto free_cs_table;
+ if (!power || power > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid power: %lu\n",
+ power);
+ goto free_ps_table;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
-
- /*
- * The hertz/watts efficiency ratio should decrease as the
- * frequency grows on sane platforms. But this isn't always
- * true in practice so warn the user if a higher OPP is more
- * power efficient than a lower one.
- */
- opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
- pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
- cpu, i, i - 1);
- prev_opp_eff = opp_eff;
}
- /* Compute the cost of each capacity_state. */
+ /* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
- for (i = 0; i < nr_states; i++) {
- table[i].cost = div64_u64(fmax * table[i].power,
- table[i].frequency);
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ goto free_ps_table;
+ }
+ } else {
+ power_res = table[i].power;
+ cost = div64_u64(fmax * power_res, table[i].frequency);
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
}
pd->table = table;
- pd->nr_cap_states = nr_states;
- cpumask_copy(to_cpumask(pd->cpus), span);
+ pd->nr_perf_states = nr_states;
- em_debug_create_pd(pd, cpu);
-
- return pd;
+ return 0;
-free_cs_table:
+free_ps_table:
kfree(table);
-free_pd:
- kfree(pd);
+ return -EINVAL;
+}
+
+static int em_create_pd(struct device *dev, int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus,
+ unsigned long flags)
+{
+ struct em_perf_domain *pd;
+ struct device *cpu_dev;
+ int cpu, ret, num_cpus;
+
+ if (_is_cpu_device(dev)) {
+ num_cpus = cpumask_weight(cpus);
+
+ /* Prevent max possible energy calculation to not overflow */
+ if (num_cpus > EM_MAX_NUM_CPUS) {
+ dev_err(dev, "EM: too many CPUs, overflow possible\n");
+ return -EINVAL;
+ }
+
+ pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(em_span_cpus(pd), cpus);
+ } else {
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+ }
+
+ ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
+ if (ret) {
+ kfree(pd);
+ return ret;
+ }
+
+ if (_is_cpu_device(dev))
+ for_each_cpu(cpu, cpus) {
+ cpu_dev = get_cpu_device(cpu);
+ cpu_dev->em_pd = pd;
+ }
+
+ dev->em_pd = pd;
+
+ return 0;
+}
+
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct em_perf_state *table;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i;
+
+ if (!_is_cpu_device(dev) || !pd)
+ return;
- return NULL;
+ policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ return;
+ }
+
+ table = pd->table;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ cpufreq_cpu_put(policy);
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
}
/**
+ * em_pd_get() - Return the performance domain for a device
+ * @dev : Device to find the performance domain for
+ *
+ * Returns the performance domain to which @dev belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_pd_get(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev))
+ return NULL;
+
+ return dev->em_pd;
+}
+EXPORT_SYMBOL_GPL(em_pd_get);
+
+/**
* em_cpu_get() - Return the performance domain for a CPU
* @cpu : CPU to find the performance domain for
*
- * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
* exist.
*/
struct em_perf_domain *em_cpu_get(int cpu)
{
- return READ_ONCE(per_cpu(em_data, cpu));
+ struct device *cpu_dev;
+
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev)
+ return NULL;
+
+ return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
- * em_register_perf_domain() - Register the Energy Model of a performance domain
- * @span : Mask of CPUs in the performance domain
- * @nr_states : Number of capacity states to register
+ * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
+ * @dev : Device for which the EM is to register
+ * @nr_states : Number of performance states to register
* @cb : Callback functions providing the data of the Energy Model
+ * @cpus : Pointer to cpumask_t, which in case of a CPU device is
+ * obligatory. It can be taken from i.e. 'policy->cpus'. For other
+ * type of devices this should be set to NULL.
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
+ * in some other scale. It must be set properly.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
+ * The @microwatts is important to set with correct value. Some kernel
+ * sub-systems might rely on this flag and check if all devices in the EM are
+ * using the same scale.
+ *
* If multiple clients register the same performance domain, all but the first
* registration will be ignored.
*
* Return 0 on success
*/
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
- struct em_data_callback *cb)
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus,
+ bool microwatts)
{
unsigned long cap, prev_cap = 0;
- struct em_perf_domain *pd;
- int cpu, ret = 0;
+ unsigned long flags = 0;
+ int cpu, ret;
- if (!span || !nr_states || !cb)
+ if (!dev || !nr_states || !cb)
return -EINVAL;
/*
@@ -212,47 +351,88 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
*/
mutex_lock(&em_pd_mutex);
- for_each_cpu(cpu, span) {
- /* Make sure we don't register again an existing domain. */
- if (READ_ONCE(per_cpu(em_data, cpu))) {
- ret = -EEXIST;
- goto unlock;
- }
+ if (dev->em_pd) {
+ ret = -EEXIST;
+ goto unlock;
+ }
- /*
- * All CPUs of a domain must have the same micro-architecture
- * since they all share the same table.
- */
- cap = arch_scale_cpu_capacity(cpu);
- if (prev_cap && prev_cap != cap) {
- pr_err("CPUs of %*pbl must have the same capacity\n",
- cpumask_pr_args(span));
+ if (_is_cpu_device(dev)) {
+ if (!cpus) {
+ dev_err(dev, "EM: invalid CPU mask\n");
ret = -EINVAL;
goto unlock;
}
- prev_cap = cap;
+
+ for_each_cpu(cpu, cpus) {
+ if (em_cpu_get(cpu)) {
+ dev_err(dev, "EM: exists for CPU%d\n", cpu);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ /*
+ * All CPUs of a domain must have the same
+ * micro-architecture since they all share the same
+ * table.
+ */
+ cap = arch_scale_cpu_capacity(cpu);
+ if (prev_cap && prev_cap != cap) {
+ dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
+ cpumask_pr_args(cpus));
+
+ ret = -EINVAL;
+ goto unlock;
+ }
+ prev_cap = cap;
+ }
}
- /* Create the performance domain and add it to the Energy Model. */
- pd = em_create_pd(span, nr_states, cb);
- if (!pd) {
- ret = -EINVAL;
+ if (microwatts)
+ flags |= EM_PERF_DOMAIN_MICROWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
+ if (ret)
goto unlock;
- }
- for_each_cpu(cpu, span) {
- /*
- * The per-cpu array can be read concurrently from em_cpu_get().
- * The barrier enforces the ordering needed to make sure readers
- * can only access well formed em_perf_domain structs.
- */
- smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
- }
+ dev->em_pd->flags |= flags;
+
+ em_cpufreq_update_efficiencies(dev);
+
+ em_debug_create_pd(dev);
+ dev_info(dev, "EM: created perf domain\n");
- pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
mutex_unlock(&em_pd_mutex);
-
return ret;
}
-EXPORT_SYMBOL_GPL(em_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
+ * @dev : Device for which the EM is registered
+ *
+ * Unregister the EM for the specified @dev (but not a CPU device).
+ */
+void em_dev_unregister_perf_domain(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
+ return;
+
+ if (_is_cpu_device(dev))
+ return;
+
+ /*
+ * The mutex separates all register/unregister requests and protects
+ * from potential clean-up/setup issues in the debugfs directories.
+ * The debugfs directory name is the same as device's name.
+ */
+ mutex_lock(&em_pd_mutex);
+ em_debug_remove_pd(dev);
+
+ kfree(dev->em_pd->table);
+ kfree(dev->em_pd);
+ dev->em_pd = NULL;
+ mutex_unlock(&em_pd_mutex);
+}
+EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 02ec716a4927..4b0b7cf2e019 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "PM: hibernation: " fmt
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/reboot.h>
@@ -28,9 +29,9 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
+#include <linux/secretmem.h>
#include <trace/events/power.h>
#include "power.h"
@@ -81,7 +82,9 @@ void hibernate_release(void)
bool hibernation_available(void)
{
- return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
+ return nohibernate == 0 &&
+ !security_locked_down(LOCKDOWN_HIBERNATION) &&
+ !secretmem_active() && !cxl_mem_active();
}
/**
@@ -90,20 +93,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -297,7 +304,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -326,7 +333,7 @@ static int create_image(int platform_mode)
if (!in_suspend) {
events_check_enabled = false;
- clear_free_pages();
+ clear_or_poison_free_pages();
}
platform_leave(platform_mode);
@@ -339,7 +346,7 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
/* Allow architectures to do nosmt-specific post-resume dances */
if (!in_suspend)
@@ -463,6 +470,8 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
+ cpuidle_pause();
+
error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -506,7 +515,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -584,7 +593,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -606,7 +615,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
@@ -633,11 +642,11 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
-#ifdef CONFIG_SUSPEND
int error;
+#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ error = suspend_devices_and_enter(mem_sleep_current);
if (error) {
hibernation_mode = hibernation_ops ?
HIBERNATION_PLATFORM :
@@ -658,10 +667,16 @@ static void power_down(void)
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
- hibernation_platform_enter();
- /* Fall through */
+ error = hibernation_platform_enter();
+ if (error == -EAGAIN || error == -EBUSY) {
+ swsusp_unmark();
+ events_check_enabled = false;
+ pr_info("Wakeup event detected during hibernation, rolling back.\n");
+ return;
+ }
+ fallthrough;
case HIBERNATION_SHUTDOWN:
- if (pm_power_off)
+ if (kernel_can_power_off())
kernel_power_off();
break;
}
@@ -684,11 +699,13 @@ static int load_image_and_restore(void)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ swsusp_close();
goto Unlock;
+ }
error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
+ swsusp_close();
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -706,15 +723,16 @@ static int load_image_and_restore(void)
*/
int hibernate(void)
{
- int error, nr_calls = 0;
bool snapshot_test = false;
+ unsigned int sleep_flags;
+ int error;
if (!hibernation_available()) {
pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -723,11 +741,9 @@ int hibernate(void)
pr_info("hibernation entry\n");
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Exit;
- }
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+ if (error)
+ goto Restore;
ksys_sync_helper();
@@ -776,7 +792,7 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check();
+ error = swsusp_check(false);
if (!error)
error = load_image_and_restore();
}
@@ -785,61 +801,119 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ Restore:
pm_restore_console();
hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
pr_info("hibernation exit\n");
return error;
}
-
/**
- * software_resume - Resume from a saved hibernation image.
- *
- * This routine is called as a late initcall, when all devices have been
- * discovered and initialized already.
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
*
- * The image reading code is called to see if there is a hibernation image
- * available for reading. If that is the case, devices are quiesced and the
- * contents of memory is restored from the saved image.
- *
- * If this is successful, control reappears in the restored target kernel in
- * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
- * attempts to recover gracefully and make the kernel return to the normal mode
- * of operation.
+ * Return the @func return value or an error code if it cannot be executed.
*/
-static int software_resume(void)
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
- int error, nr_calls = 0;
+ unsigned int sleep_flags;
+ int error;
- /*
- * If the user said "noresume".. bail out early.
- */
- if (noresume || !hibernation_available())
- return 0;
+ sleep_flags = lock_system_sleep();
- /*
- * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
- * is configured into the kernel. Since the regular hibernate
- * trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take system_transition_mutex)
- * this can cause lockdep to complain about a possible ABBA deadlock
- * which cannot happen since we're in the boot code here and
- * sysfs can't be invoked yet. Therefore, we use a subclass
- * here to avoid lockdep complaining.
- */
- mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
+ if (!hibernate_acquire()) {
+ error = -EBUSY;
+ goto unlock;
+ }
- if (swsusp_resume_device)
- goto Check_image;
+ pm_prepare_console();
- if (!strlen(resume_file)) {
- error = -ENOENT;
- goto Unlock;
- }
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+ if (error)
+ goto restore;
+
+ error = freeze_processes();
+ if (error)
+ goto exit;
+
+ lock_device_hotplug();
+
+ pm_suspend_clear_flags();
+
+ error = platform_begin(true);
+ if (error)
+ goto thaw;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto thaw;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto dpm_complete;
+
+ suspend_console();
+
+ error = dpm_suspend(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = platform_pre_snapshot(true);
+ if (error)
+ goto skip;
+
+ error = func(data);
+
+skip:
+ platform_finish(true);
+
+ dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+ dpm_resume(PMSG_THAW);
+
+ resume_console();
+
+dpm_complete:
+ dpm_complete(PMSG_THAW);
+
+ thaw_kernel_threads();
+
+thaw:
+ platform_end(true);
+
+ unlock_device_hotplug();
+
+ thaw_processes();
+
+exit:
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+
+restore:
+ pm_restore_console();
+
+ hibernate_release();
+
+unlock:
+ unlock_system_sleep(sleep_flags);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
+
+static int __init find_resume_device(void)
+{
+ if (!strlen(resume_file))
+ return -ENOENT;
pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
@@ -850,61 +924,49 @@ static int software_resume(void)
}
/* Check if the device is there */
- swsusp_resume_device = name_to_dev_t(resume_file);
+ if (!early_lookup_bdev(resume_file, &swsusp_resume_device))
+ return 0;
/*
- * name_to_dev_t is ineffective to verify parition if resume_file is in
- * integer format. (e.g. major:minor)
+ * Some device discovery might still be in progress; we need to wait for
+ * this to finish.
*/
- if (isdigit(resume_file[0]) && resume_wait) {
- int partno;
- while (!get_gendisk(swsusp_resume_device, &partno))
+ wait_for_device_probe();
+ if (resume_wait) {
+ while (early_lookup_bdev(resume_file, &swsusp_resume_device))
msleep(10);
+ async_synchronize_full();
}
- if (!swsusp_resume_device) {
- /*
- * Some device discovery might still be in progress; we need
- * to wait for this to finish.
- */
- wait_for_device_probe();
-
- if (resume_wait) {
- while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
- msleep(10);
- async_synchronize_full();
- }
+ return early_lookup_bdev(resume_file, &swsusp_resume_device);
+}
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- error = -ENODEV;
- goto Unlock;
- }
- }
+static int software_resume(void)
+{
+ int error;
- Check_image:
pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
pm_pr_dbg("Looking for hibernation image.\n");
- error = swsusp_check();
+
+ mutex_lock(&system_transition_mutex);
+ error = swsusp_check(true);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ);
+ swsusp_close();
goto Unlock;
}
pr_info("resume from hibernation\n");
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Close_Finish;
- }
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
+ if (error)
+ goto Restore;
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
@@ -920,7 +982,8 @@ static int software_resume(void)
error = load_image_and_restore();
thaw_processes();
Finish:
- __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ Restore:
pm_restore_console();
pr_info("resume failed (%d)\n", error);
hibernate_release();
@@ -930,11 +993,43 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ);
+ swsusp_close();
goto Finish;
}
-late_initcall_sync(software_resume);
+/**
+ * software_resume_initcall - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+static int __init software_resume_initcall(void)
+{
+ /*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ if (!swsusp_resume_device) {
+ int error = find_resume_device();
+
+ if (error)
+ return error;
+ }
+
+ return software_resume();
+}
+late_initcall_sync(software_resume_initcall);
static const char * const hibernation_modes[] = {
@@ -1011,11 +1106,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -1023,7 +1119,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -1053,7 +1149,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!error)
pm_pr_dbg("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -1062,16 +1158,21 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t dev;
+ int error;
+
+ if (!hibernation_available())
+ return n;
if (len && buf[len-1] == '\n')
len--;
@@ -1079,14 +1180,32 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!name)
return -ENOMEM;
- res = name_to_dev_t(name);
+ error = lookup_bdev(name, &dev);
+ if (error) {
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ error = 0;
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset,
+ &dummy) == 3) {
+ dev = MKDEV(maj, min);
+ if (maj != MAJOR(dev) || min != MINOR(dev))
+ error = -EINVAL;
+ } else {
+ dev = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ error = -EINVAL;
+ }
+ }
kfree(name);
- if (!res)
- return -EINVAL;
+ if (error)
+ return error;
+
+ sleep_flags = lock_system_sleep();
+ swsusp_resume_device = dev;
+ unlock_system_sleep(sleep_flags);
- lock_system_sleep();
- swsusp_resume_device = res;
- unlock_system_sleep();
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
swsusp_resume_device);
noresume = 0;
@@ -1162,7 +1281,7 @@ static ssize_t reserved_size_store(struct kobject *kobj,
power_attr(reserved_size);
-static struct attribute * g[] = {
+static struct attribute *g[] = {
&disk_attr.attr,
&resume_offset_attr.attr,
&resume_attr.attr,
@@ -1190,7 +1309,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy( resume_file, str, 255 );
+ strncpy(resume_file, str, 255);
return 1;
}
@@ -1240,7 +1359,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 40f86ec4ab30..b1ae9b677d03 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,6 +6,7 @@
* Copyright (c) 2003 Open Source Development Lab
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
@@ -20,33 +21,47 @@
#include "power.h"
#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
+}
-void lock_system_sleep(void)
+void pm_restrict_gfp_mask(void)
{
- current->flags |= PF_FREEZER_SKIP;
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+}
+
+unsigned int lock_system_sleep(void)
+{
+ unsigned int flags = current->flags;
+ current->flags |= PF_NOFREEZE;
mutex_lock(&system_transition_mutex);
+ return flags;
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
-void unlock_system_sleep(void)
-{
- /*
- * Don't use freezer_count() because we don't want the call to
- * try_to_freeze() here.
- *
- * Reason:
- * Fundamentally, we just don't need it, because freezing condition
- * doesn't come into effect until we release the
- * system_transition_mutex lock, since the freezer always works with
- * system_transition_mutex held.
- *
- * More importantly, in the case of hibernation,
- * unlock_system_sleep() gets called in snapshot_read() and
- * snapshot_write() when the freezing condition is still in effect.
- * Which means, if we use try_to_freeze() here, it would make them
- * enter the refrigerator, thus causing hibernation to lockup.
- */
- current->flags &= ~PF_FREEZER_SKIP;
+void unlock_system_sleep(unsigned int flags)
+{
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -80,18 +95,31 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
+int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
- ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
- nr_to_call, nr_calls);
+ ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL);
return notifier_to_errno(ret);
}
+
int pm_notifier_call_chain(unsigned long val)
{
- return __pm_notifier_call_chain(val, -1, NULL);
+ return blocking_notifier_call_chain(&pm_chain_head, val, NULL);
}
/* If set, devices may be suspended and resumed asynchronously. */
@@ -127,7 +155,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
char *s = buf;
suspend_state_t i;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
@@ -136,6 +166,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
else
s += sprintf(s, "%s ", label);
}
+ }
/* Convert the last space to a newline if needed. */
if (s != buf)
@@ -260,16 +291,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -279,7 +311,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -307,24 +339,27 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
-#define suspend_attr(_name) \
+#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%d\n", suspend_stats._name); \
+ return sprintf(buf, format_str, suspend_stats._name); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_attr(success, "%d\n");
+suspend_attr(fail, "%d\n");
+suspend_attr(failed_freeze, "%d\n");
+suspend_attr(failed_prepare, "%d\n");
+suspend_attr(failed_suspend, "%d\n");
+suspend_attr(failed_suspend_late, "%d\n");
+suspend_attr(failed_suspend_noirq, "%d\n");
+suspend_attr(failed_resume, "%d\n");
+suspend_attr(failed_resume_early, "%d\n");
+suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -384,12 +419,30 @@ static struct attribute *suspend_attrs[] = {
&last_failed_dev.attr,
&last_failed_errno.attr,
&last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
NULL,
};
-static struct attribute_group suspend_attr_group = {
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
+static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
};
#ifdef CONFIG_DEBUG_FS
@@ -504,13 +557,22 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sprintf(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
bool pm_debug_messages_on __read_mostly;
+bool pm_debug_messages_should_print(void)
+{
+ return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON;
+}
+EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
+
static ssize_t pm_debug_messages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -542,42 +604,13 @@ static int __init pm_debug_messages_setup(char *str)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
struct kobject *power_kobj;
-/**
+/*
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ba2094db6294..8499a39c62f4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -4,6 +4,8 @@
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
struct swsusp_info {
struct new_utsname uts;
@@ -24,23 +26,18 @@ extern void __init hibernate_image_size_init(void);
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
-extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
-extern int arch_hibernation_header_restore(void *addr);
-
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
}
-static inline char *check_image_kernel(struct swsusp_info *info)
+static inline const char *check_image_kernel(struct swsusp_info *info)
{
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int hibernate_resume_nonboot_cpu_disable(void);
-
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -106,7 +103,7 @@ extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
-extern void clear_free_pages(void);
+extern void clear_or_poison_free_pages(void);
/**
* Auxiliary structure used for reading the snapshot image data and
@@ -168,15 +165,18 @@ extern int swsusp_swap_in_use(void);
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
+#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
-extern int swsusp_check(void);
+int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(fmode_t);
+void swsusp_close(void);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
#endif
struct __kernel_old_timeval;
@@ -210,9 +210,13 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
-extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
- int *nr_calls);
+extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
+void pm_restrict_gfp_mask(void);
+void pm_restore_gfp_mask(void);
+#else
+static inline void pm_restrict_gfp_mask(void) {}
+static inline void pm_restore_gfp_mask(void) {}
#endif
#ifdef CONFIG_HIGHMEM
@@ -311,3 +315,15 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+ cpuidle_pause();
+ return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+ suspend_enable_secondary_cpus();
+ cpuidle_resume();
+}
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 562aa0e450ed..1f306f158696 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff);
-static void handle_poweroff(int key)
+static void handle_poweroff(u8 key)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4b6a54da7e65..cae81a87cc91 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -1,14 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * drivers/power/process.c - Functions for starting/stopping processes on
+ * drivers/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
*
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
@@ -30,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -39,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -53,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -86,28 +86,26 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
if (wq_busy)
- show_workqueue_state();
+ show_freezable_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -132,21 +130,18 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
- pm_wakeup_clear(true);
- pr_info("Freezing user space processes ... ");
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
- * Now that the whole userspace is frozen we need to disbale
+ * Now that the whole userspace is frozen we need to disable
* the OOM killer to disallow any further interference with
* killable tasks. There is no guarantee oom victims will
* ever reach a point they go away we have to wait with a timeout.
@@ -171,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -193,7 +183,7 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
@@ -235,7 +225,7 @@ void thaw_kernel_threads(void)
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ if (p->flags & PF_KTHREAD)
__thaw_task(p);
}
read_unlock(&tasklist_lock);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index db0bed2cae26..4244b069442e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -119,7 +119,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
* and add, then see if the aggregate has changed.
*/
plist_del(node, &c->list);
- /* fall through */
+ fallthrough;
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -188,7 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
- /* fall through */
+ fallthrough;
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
@@ -220,6 +220,11 @@ static struct pm_qos_constraints cpu_latency_constraints = {
.type = PM_QOS_MIN,
};
+static inline bool cpu_latency_qos_value_invalid(s32 value)
+{
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
/**
* cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit.
*/
@@ -263,7 +268,7 @@ static void cpu_latency_qos_apply(struct pm_qos_request *req,
*/
void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value)
{
- if (!req)
+ if (!req || cpu_latency_qos_value_invalid(value))
return;
if (cpu_latency_qos_request_active(req)) {
@@ -289,7 +294,7 @@ EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request);
*/
void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || cpu_latency_qos_value_invalid(new_value))
return;
if (!cpu_latency_qos_request_active(req)) {
@@ -426,6 +431,11 @@ late_initcall(cpu_latency_qos_init);
/* Definitions related to the frequency QoS below. */
+static inline bool freq_qos_value_invalid(s32 value)
+{
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
/**
* freq_constraints_init - Initialize frequency QoS constraints.
* @qos: Frequency QoS constraints to initialize.
@@ -531,7 +541,7 @@ int freq_qos_add_request(struct freq_constraints *qos,
{
int ret;
- if (IS_ERR_OR_NULL(qos) || !req)
+ if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value))
return -EINVAL;
if (WARN(freq_qos_request_active(req),
@@ -563,7 +573,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request);
*/
int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || freq_qos_value_invalid(new_value))
return -EINVAL;
if (WARN(!freq_qos_request_active(req),
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 881128b9351e..5c96ff067c64 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {}
static inline void hibernate_restore_unprotect_page(void *page_address) {}
#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
+
+/*
+ * The calls to set_direct_map_*() should not fail because remapping a page
+ * here means that we only update protection bits in an existing PTE.
+ * It is still worth to have a warning here if something changes and this
+ * will no longer be the case.
+ */
+static inline void hibernate_map_page(struct page *page)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+ int ret = set_direct_map_default_noflush(page);
+
+ if (ret)
+ pr_warn_once("Failed to remap page\n");
+ } else {
+ debug_pagealloc_map_pages(page, 1);
+ }
+}
+
+static inline void hibernate_unmap_page(struct page *page)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+ unsigned long addr = (unsigned long)page_address(page);
+ int ret = set_direct_map_invalid_noflush(page);
+
+ if (ret)
+ pr_warn_once("Failed to remap page\n");
+
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ } else {
+ debug_pagealloc_unmap_pages(page, 1);
+ }
+}
+
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
@@ -292,12 +326,12 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
-/**
+/*
* Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
+ * Memory bitmap is a structure consisting of many linked lists of
* objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
* object there is a list of objects of type struct bm_block that
* represent each blocks of bitmap in which information is stored.
*
@@ -364,12 +398,13 @@ struct mem_zone_bm_rtree {
unsigned int blocks; /* Number of Bitmap Blocks */
};
-/* strcut bm_position is used for browsing memory bitmaps */
+/* struct bm_position is used for browsing memory bitmaps */
struct bm_position {
struct mem_zone_bm_rtree *zone;
struct rtree_node *node;
unsigned long node_pfn;
+ unsigned long cur_pfn;
int node_bit;
};
@@ -393,6 +428,10 @@ struct memory_bitmap {
/**
* alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
*
* This function is used to allocate inner nodes as well as the
* leave nodes of the radix tree. It also adds the node to the
@@ -551,6 +590,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
+ bm->cur.cur_pfn = BM_END_OF_MAP;
bm->cur.node_bit = 0;
}
@@ -735,7 +775,7 @@ zone_found:
*/
/*
- * If the zone we wish to scan is the the current zone and the
+ * If the zone we wish to scan is the current zone and the
* pfn falls into the current node then we do not need to walk
* the tree.
*/
@@ -761,6 +801,7 @@ node_found:
bm->cur.zone = zone;
bm->cur.node = node;
bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.cur_pfn = pfn;
/* Set return values */
*addr = node->data;
@@ -812,6 +853,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm)
clear_bit(bit, bm->cur.node->data);
}
+static unsigned long memory_bm_get_current(struct memory_bitmap *bm)
+{
+ return bm->cur.cur_pfn;
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -868,7 +914,7 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
* @bm: Memory bitmap.
*
* Starting from the last returned position this function searches for the next
@@ -891,10 +937,12 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
if (bit < bits) {
pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
bm->cur.node_bit = bit + 1;
+ bm->cur.cur_pfn = pfn;
return pfn;
}
} while (rtree_next_node(bm));
+ bm->cur.cur_pfn = BM_END_OF_MAP;
return BM_END_OF_MAP;
}
@@ -944,8 +992,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
* Register a range of page frames the contents of which should not be saved
* during hibernation (to be used in the early initialization code).
*/
-void __init __register_nosave_region(unsigned long start_pfn,
- unsigned long end_pfn, int use_kmalloc)
+void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
{
struct nosave_region *region;
@@ -961,18 +1008,12 @@ void __init __register_nosave_region(unsigned long start_pfn,
goto Report;
}
}
- if (use_kmalloc) {
- /* During init, this shouldn't fail */
- region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
- BUG_ON(!region);
- } else {
- /* This allocation cannot fail */
- region = memblock_alloc(sizeof(struct nosave_region),
- SMP_CACHE_BYTES);
- if (!region)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(struct nosave_region));
- }
+ /* This allocation cannot fail */
+ region = memblock_alloc(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -1078,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- int error = 0;
+ int error;
if (forbidden_pages_map && free_pages_map)
return 0;
@@ -1112,7 +1153,7 @@ int create_basic_memory_bitmaps(void)
Free_second_object:
kfree(bm2);
Free_first_bitmap:
- memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
Free_first_object:
kfree(bm1);
return -ENOMEM;
@@ -1144,7 +1185,15 @@ void free_basic_memory_bitmaps(void)
pr_debug("Basic memory bitmaps freed\n");
}
-void clear_free_pages(void)
+static void clear_or_poison_free_page(struct page *page)
+{
+ if (page_poisoning_enabled_static())
+ __kernel_poison_pages(page, 1);
+ else if (want_init_on_free())
+ clear_highpage(page);
+}
+
+void clear_or_poison_free_pages(void)
{
struct memory_bitmap *bm = free_pages_map;
unsigned long pfn;
@@ -1152,12 +1201,12 @@ void clear_free_pages(void)
if (WARN_ON(!(free_pages_map)))
return;
- if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+ if (page_poisoning_enabled() || want_init_on_free()) {
memory_bm_position_reset(bm);
pfn = memory_bm_next_pfn(bm);
while (pfn != BM_END_OF_MAP) {
if (pfn_valid(pfn))
- clear_highpage(pfn_to_page(pfn));
+ clear_or_poison_free_page(pfn_to_page(pfn));
pfn = memory_bm_next_pfn(bm);
}
@@ -1189,6 +1238,58 @@ unsigned int snapshot_additional_pages(struct zone *zone)
return 2 * rtree;
}
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
+static void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+ unsigned long flags;
+ unsigned int order, t;
+ struct page *page;
+
+ if (zone_is_empty(zone))
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], buddy_list) {
+ unsigned long i;
+
+ pfn = page_to_pfn(page);
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
#ifdef CONFIG_HIGHMEM
/**
* count_free_highmem_pages - Compute the total number of free highmem pages.
@@ -1332,14 +1433,19 @@ static unsigned int count_data_pages(void)
/*
* This is needed, because copy_page and memcpy are not usable for copying
- * task structs.
+ * task structs. Returns true if the page was filled with only zeros,
+ * otherwise false.
*/
-static inline void do_copy_page(long *dst, long *src)
+static inline bool do_copy_page(long *dst, long *src)
{
+ long z = 0;
int n;
- for (n = PAGE_SIZE / sizeof(long); n; n--)
+ for (n = PAGE_SIZE / sizeof(long); n; n--) {
+ z |= *src;
*dst++ = *src++;
+ }
+ return !z;
}
/**
@@ -1348,17 +1454,21 @@ static inline void do_copy_page(long *dst, long *src)
* Check if the page we are going to copy is marked as present in the kernel
* page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
* CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
- * always returns 'true'.
+ * always returns 'true'. Returns true if the page was entirely composed of
+ * zeros, otherwise it will return false.
*/
-static void safe_copy_page(void *dst, struct page *s_page)
+static bool safe_copy_page(void *dst, struct page *s_page)
{
+ bool zeros_only;
+
if (kernel_page_present(s_page)) {
- do_copy_page(dst, page_address(s_page));
+ zeros_only = do_copy_page(dst, page_address(s_page));
} else {
- kernel_map_pages(s_page, 1, 1);
- do_copy_page(dst, page_address(s_page));
- kernel_map_pages(s_page, 1, 0);
+ hibernate_map_page(s_page);
+ zeros_only = do_copy_page(dst, page_address(s_page));
+ hibernate_unmap_page(s_page);
}
+ return zeros_only;
}
#ifdef CONFIG_HIGHMEM
@@ -1368,49 +1478,59 @@ static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
}
-static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
struct page *s_page, *d_page;
void *src, *dst;
+ bool zeros_only;
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
- src = kmap_atomic(s_page);
- dst = kmap_atomic(d_page);
- do_copy_page(dst, src);
- kunmap_atomic(dst);
- kunmap_atomic(src);
+ src = kmap_local_page(s_page);
+ dst = kmap_local_page(d_page);
+ zeros_only = do_copy_page(dst, src);
+ kunmap_local(dst);
+ kunmap_local(src);
} else {
if (PageHighMem(d_page)) {
/*
* The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
- safe_copy_page(buffer, s_page);
- dst = kmap_atomic(d_page);
+ zeros_only = safe_copy_page(buffer, s_page);
+ dst = kmap_local_page(d_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
} else {
- safe_copy_page(page_address(d_page), s_page);
+ zeros_only = safe_copy_page(page_address(d_page), s_page);
}
}
+ return zeros_only;
}
#else
#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
-static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
- safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ return safe_copy_page(page_address(pfn_to_page(dst_pfn)),
pfn_to_page(src_pfn));
}
#endif /* CONFIG_HIGHMEM */
-static void copy_data_pages(struct memory_bitmap *copy_bm,
- struct memory_bitmap *orig_bm)
+/*
+ * Copy data pages will copy all pages into pages pulled from the copy_bm.
+ * If a page was entirely filled with zeros it will be marked in the zero_bm.
+ *
+ * Returns the number of pages copied.
+ */
+static unsigned long copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long copied_pages = 0;
struct zone *zone;
- unsigned long pfn;
+ unsigned long pfn, copy_pfn;
for_each_populated_zone(zone) {
unsigned long max_zone_pfn;
@@ -1423,18 +1543,29 @@ static void copy_data_pages(struct memory_bitmap *copy_bm,
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
+ copy_pfn = memory_bm_next_pfn(copy_bm);
for(;;) {
pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
+ if (copy_data_page(copy_pfn, pfn)) {
+ memory_bm_set_bit(zero_bm, pfn);
+ /* Use this copy_pfn for a page that is not full of zeros */
+ continue;
+ }
+ copied_pages++;
+ copy_pfn = memory_bm_next_pfn(copy_bm);
}
+ return copied_pages;
}
/* Total number of image pages */
static unsigned int nr_copy_pages;
/* Number of pages needed for saving the original pfns of the image pages */
static unsigned int nr_meta_pages;
+/* Number of zero pages */
+static unsigned int nr_zero_pages;
+
/*
* Numbers of normal and highmem page frames allocated for hibernation image
* before suspending devices.
@@ -1455,10 +1586,13 @@ static struct memory_bitmap orig_bm;
*/
static struct memory_bitmap copy_bm;
+/* Memory bitmap which tracks which saveable pages were zero filled. */
+static struct memory_bitmap zero_bm;
+
/**
* swsusp_free - Free pages allocated for hibernation image.
*
- * Image pages are alocated before snapshot creation, so they need to be
+ * Image pages are allocated before snapshot creation, so they need to be
* released after resume.
*/
void swsusp_free(void)
@@ -1499,6 +1633,7 @@ loop:
out:
nr_copy_pages = 0;
nr_meta_pages = 0;
+ nr_zero_pages = 0;
restore_pblist = NULL;
buffer = NULL;
alloc_normal = 0;
@@ -1663,7 +1798,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
@@ -1684,8 +1819,8 @@ static unsigned long minimum_image_size(unsigned long saveable)
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -1717,8 +1852,15 @@ int hibernate_preallocate_memory(void)
goto err_out;
}
+ error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY);
+ if (error) {
+ pr_err("Cannot allocate zero bitmap\n");
+ goto err_out;
+ }
+
alloc_normal = 0;
alloc_highmem = 0;
+ nr_zero_pages = 0;
/* Count the number of saveable data pages. */
save_highmem = count_highmem_pages();
@@ -1902,7 +2044,7 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
*
* Try to allocate as many pages as needed, but if the number of free highmem
* pages is less than that, allocate them all.
@@ -1998,19 +2140,19 @@ asmlinkage __visible int swsusp_save(void)
* Kill them.
*/
drain_local_pages(NULL);
- copy_data_pages(&copy_bm, &orig_bm);
+ nr_copy_pages = copy_data_pages(&copy_bm, &orig_bm, &zero_bm);
/*
* End of critical section. From now on, we can write to memory,
* but we should not touch disk. This specially means we must _not_
* touch swap space! Except we must write out our image of course.
*/
-
nr_pages += nr_highmem;
- nr_copy_pages = nr_pages;
+ /* We don't actually copy the zero pages */
+ nr_zero_pages = nr_pages - nr_copy_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Image created (%d pages copied)\n", nr_pages);
+ pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages);
return 0;
}
@@ -2023,7 +2165,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-static char *check_image_kernel(struct swsusp_info *info)
+static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -2055,15 +2197,22 @@ static int init_header(struct swsusp_info *info)
return init_header_complete(info);
}
+#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1))
+#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG)
+
/**
* pack_pfns - Prepare PFNs for saving.
* @bm: Memory bitmap.
* @buf: Memory buffer to store the PFNs in.
+ * @zero_bm: Memory bitmap containing PFNs of zero pages.
*
* PFNs corresponding to set bits in @bm are stored in the area of memory
- * pointed to by @buf (1 page at a time).
+ * pointed to by @buf (1 page at a time). Pages which were filled with only
+ * zeros will have the highest bit set in the packed format to distinguish
+ * them from PFNs which will be contained in the image file.
*/
-static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
int j;
@@ -2071,6 +2220,8 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
+ if (memory_bm_test_bit(zero_bm, buf[j]))
+ buf[j] |= ENCODED_PFN_ZERO_FLAG;
}
}
@@ -2112,7 +2263,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
memory_bm_position_reset(&copy_bm);
} else if (handle->cur <= nr_meta_pages) {
clear_page(buffer);
- pack_pfns(buffer, &orig_bm);
+ pack_pfns(buffer, &orig_bm, &zero_bm);
} else {
struct page *page;
@@ -2176,7 +2327,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm)
static int check_header(struct swsusp_info *info)
{
- char *reason;
+ const char *reason;
reason = check_image_kernel(info);
if (!reason && info->num_physpages != get_num_physpages())
@@ -2189,7 +2340,7 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - Check the image header and copy the data from it.
+ * load_header - Check the image header and copy the data from it.
*/
static int load_header(struct swsusp_info *info)
{
@@ -2208,22 +2359,37 @@ static int load_header(struct swsusp_info *info)
* unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
* @bm: Memory bitmap.
* @buf: Area of memory containing the PFNs.
+ * @zero_bm: Memory bitmap with the zero PFNs marked.
*
* For each element of the array pointed to by @buf (1 page at a time), set the
- * corresponding bit in @bm.
+ * corresponding bit in @bm. If the page was originally populated with only
+ * zeros then a corresponding bit will also be set in @zero_bm.
*/
-static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long decoded_pfn;
+ bool zero;
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
- memory_bm_set_bit(bm, buf[j]);
- else
+ zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG);
+ decoded_pfn = buf[j] & ENCODED_PFN_MASK;
+ if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) {
+ memory_bm_set_bit(bm, decoded_pfn);
+ if (zero) {
+ memory_bm_set_bit(zero_bm, decoded_pfn);
+ nr_zero_pages++;
+ }
+ } else {
+ if (!pfn_valid(decoded_pfn))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(decoded_pfn));
return -EFAULT;
+ }
}
return 0;
@@ -2284,7 +2450,7 @@ static struct memory_bitmap *safe_highmem_bm;
* (@nr_highmem_p points to the variable containing the number of highmem image
* pages). The pages that are "safe" (ie. will not be overwritten when the
* hibernation image is restored entirely) have the corresponding bits set in
- * @bm (it must be unitialized).
+ * @bm (it must be uninitialized).
*
* NOTE: This function should not be called if there are no highmem image pages.
*/
@@ -2379,8 +2545,9 @@ static void *get_highmem_page_buffer(struct page *page,
pbe->copy_page = tmp;
} else {
/* Copy of the page will be stored in normal memory */
- kaddr = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ kaddr = __get_safe_page(ca->gfp_mask);
+ if (!kaddr)
+ return ERR_PTR(-ENOMEM);
pbe->copy_page = virt_to_page(kaddr);
}
pbe->next = highmem_pblist;
@@ -2441,8 +2608,9 @@ static inline void free_highmem_data(void) {}
/**
* prepare_image - Make room for loading hibernation image.
- * @new_bm: Unitialized memory bitmap structure.
+ * @new_bm: Uninitialized memory bitmap structure.
* @bm: Memory bitmap with unsafe pages marked.
+ * @zero_bm: Memory bitmap containing the zero pages.
*
* Use @bm to mark the pages that will be overwritten in the process of
* restoring the system memory state from the suspend image ("unsafe" pages)
@@ -2453,10 +2621,15 @@ static inline void free_highmem_data(void) {}
* pages will be used for just yet. Instead, we mark them all as allocated and
* create a lists of "safe" pages to be used later. On systems with high
* memory a list of "safe" highmem pages is created too.
+ *
+ * Because it was not known which pages were unsafe when @zero_bm was created,
+ * make a copy of it and recreate it within safe pages.
*/
-static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
unsigned int nr_pages, nr_highmem;
+ struct memory_bitmap tmp;
struct linked_page *lp;
int error;
@@ -2473,6 +2646,24 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
duplicate_memory_bitmap(new_bm, bm);
memory_bm_free(bm, PG_UNSAFE_KEEP);
+
+ /* Make a copy of zero_bm so it can be created in safe pages */
+ error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(&tmp, zero_bm);
+ memory_bm_free(zero_bm, PG_UNSAFE_KEEP);
+
+ /* Recreate zero_bm in safe pages */
+ error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(zero_bm, &tmp);
+ memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
+ /* At this point zero_bm is in safe pages and it can be used for restoring. */
+
if (nr_highmem > 0) {
error = prepare_highmem_image(bm, &nr_highmem);
if (error)
@@ -2487,7 +2678,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
*
* nr_copy_pages cannot be less than allocated_unsafe_pages too.
*/
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
lp = get_image_page(GFP_ATOMIC, PG_SAFE);
@@ -2500,7 +2691,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
nr_pages--;
}
/* Preallocate memory for the image */
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
if (!lp) {
@@ -2560,8 +2751,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return ERR_PTR(-ENOMEM);
}
pbe->orig_address = page_address(page);
- pbe->address = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ pbe->address = __get_safe_page(ca->gfp_mask);
+ if (!pbe->address)
+ return ERR_PTR(-ENOMEM);
pbe->next = restore_pblist;
restore_pblist = pbe;
return pbe->address;
@@ -2586,14 +2778,13 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
- int error = 0;
+ int error;
+next:
/* Check if we have already loaded the entire image */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages)
return 0;
- handle->sync_read = 1;
-
if (!handle->cur) {
if (!buffer)
/* This makes the buffer be freed by swsusp_free() */
@@ -2614,22 +2805,28 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
+ nr_zero_pages = 0;
+
hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
- error = unpack_orig_pfns(buffer, &copy_bm);
+ error = unpack_orig_pfns(buffer, &copy_bm, &zero_bm);
if (error)
return error;
if (handle->cur == nr_meta_pages + 1) {
- error = prepare_image(&orig_bm, &copy_bm);
+ error = prepare_image(&orig_bm, &copy_bm, &zero_bm);
if (error)
return error;
chain_init(&ca, GFP_ATOMIC, PG_SAFE);
memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&zero_bm);
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
}
@@ -2639,10 +2836,17 @@ int snapshot_write_next(struct snapshot_handle *handle)
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
- if (handle->buffer != buffer)
- handle->sync_read = 0;
}
+ handle->sync_read = (handle->buffer == buffer);
handle->cur++;
+
+ /* Zero pages were not included in the image, memset it and move on. */
+ if (handle->cur > nr_meta_pages + 1 &&
+ memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) {
+ memset(handle->buffer, 0, PAGE_SIZE);
+ goto next;
+ }
+
return PAGE_SIZE;
}
@@ -2659,7 +2863,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
copy_last_highmem_page();
hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) {
memory_bm_recycle(&orig_bm);
free_highmem_data();
}
@@ -2668,7 +2872,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
int snapshot_image_loaded(struct snapshot_handle *handle)
{
return !(!nr_copy_pages || !last_highmem_page_copied() ||
- handle->cur <= nr_meta_pages + nr_copy_pages);
+ handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages);
}
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8b1bb5ee7e5d..fa3bf161d13f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
s2idle_ops = ops;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
static void s2idle_begin(void)
@@ -96,8 +98,7 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- get_online_cpus();
- cpuidle_resume();
+ cpus_read_lock();
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
@@ -105,8 +106,7 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
- cpuidle_pause();
- put_online_cpus();
+ cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
@@ -138,7 +138,8 @@ static void s2idle_loop(void)
break;
}
- pm_wakeup_clear(false);
+ if (s2idle_ops && s2idle_ops->check)
+ s2idle_ops->check();
s2idle_enter();
}
@@ -162,11 +163,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
/*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
- * support and need to be valid to the low level
- * implementation, no valid callback implies that none are valid.
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+ * support and need to be valid to the low-level implementation.
+ *
+ * No ->valid() or ->enter() callback implies that none are valid.
*/
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+ suspend_ops->enter;
}
void __init pm_states_init(void)
@@ -202,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
suspend_ops = ops;
@@ -218,12 +223,13 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
mem_sleep_current = PM_SUSPEND_MEM;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
/**
* suspend_valid_only_mem - Generic memory-only valid callback.
+ * @state: Target system sleep state.
*
* Platform drivers that implement mem suspend only and only need to check for
* that in their .valid() callback can use this instead of rolling their own
@@ -237,7 +243,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE ||
+ (valid_state(state) && !cxl_mem_active());
}
static int platform_suspend_prepare(suspend_state_t state)
@@ -335,6 +342,7 @@ static int suspend_test(int level)
/**
* suspend_prepare - Prepare for entering system sleep state.
+ * @state: Target system sleep state.
*
* Common code run for every system sleep state that can be entered (except for
* hibernation). Run suspend notifiers, allocate the "suspend" console and
@@ -342,18 +350,16 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error, nr_calls = 0;
+ int error;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Finish;
- }
+ error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND);
+ if (error)
+ goto Restore;
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -363,8 +369,8 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
- Finish:
- __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ Restore:
pm_restore_console();
return error;
}
@@ -422,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -452,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index e1ed58adb69e..b663a97f5867 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
- if (!candidate->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 01e2858b5fe3..6053ddddaf65 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -36,6 +35,8 @@
#define HIBERNATE_SIG "S1SUSPEND"
+u32 swsusp_hardware_signature;
+
/*
* When reading an {un,}compressed image, we may restore pages in place,
* in which case some architectures need these pages cleaning before they
@@ -87,7 +88,7 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
-/**
+/*
* The swap_map_handle structure is used for handling swap in
* a file-alike way
*/
@@ -104,7 +105,8 @@ struct swap_map_handle {
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
- sizeof(u32)];
+ sizeof(u32) - sizeof(u32)];
+ u32 hw_sig;
u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
@@ -114,7 +116,7 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
-/**
+/*
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
@@ -168,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/**
+/*
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
@@ -187,7 +189,7 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/**
+/*
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.
@@ -220,12 +222,13 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-static struct block_device *hib_resume_bdev;
+static struct bdev_handle *hib_resume_bdev_handle;
struct hib_bio_batch {
atomic_t count;
wait_queue_head_t wait;
blk_status_t error;
+ struct blk_plug plug;
};
static void hib_init_batch(struct hib_bio_batch *hb)
@@ -233,6 +236,12 @@ static void hib_init_batch(struct hib_bio_batch *hb)
atomic_set(&hb->count, 0);
init_waitqueue_head(&hb->wait);
hb->error = BLK_STS_OK;
+ blk_start_plug(&hb->plug);
+}
+
+static void hib_finish_batch(struct hib_bio_batch *hb)
+{
+ blk_finish_plug(&hb->plug);
}
static void hib_end_io(struct bio *bio)
@@ -260,17 +269,16 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
- struct hib_bio_batch *hb)
+static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
struct bio *bio;
int error = 0;
- bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+ bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+ GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio_set_dev(bio, hib_resume_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
pr_err("Adding page to bio failed at %llu\n",
@@ -292,8 +300,12 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
return error;
}
-static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
+static int hib_wait_io(struct hib_bio_batch *hb)
{
+ /*
+ * We are relying on the behavior of blk_plug that a thread with
+ * a plug will flush the plug list before sleeping.
+ */
wait_event(hb->wait, atomic_read(&hb->count) == 0);
return blk_status_to_errno(hb->error);
}
@@ -301,22 +313,24 @@ static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
/*
* Saving part
*/
-
static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
+ if (swsusp_hardware_signature) {
+ swsusp_header->hw_sig = swsusp_hardware_signature;
+ flags |= SF_HW_SIG;
+ }
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
pr_err("Swap header not found!\n");
@@ -335,26 +349,23 @@ static int swsusp_swap_check(void)
{
int res;
- res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
- &hib_resume_bdev);
+ if (swsusp_resume_device)
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+ else
+ res = find_first_swap(&swsusp_resume_device);
if (res < 0)
return res;
-
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
- if (res)
- return res;
- res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ BLK_OPEN_WRITE, NULL, NULL);
+ if (IS_ERR(hib_resume_bdev_handle))
+ return PTR_ERR(hib_resume_bdev_handle);
+
+ res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ bdev_release(hib_resume_bdev_handle);
- /*
- * Update the resume device to the one actually used,
- * so the test_resume mode can use it in case it is
- * invoked from hibernate() to test the snapshot.
- */
- swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
@@ -396,7 +407,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -433,14 +444,14 @@ static int get_swap_writer(struct swap_map_handle *handle)
err_rel:
release_swap_writer(handle);
err_close:
- swsusp_close(FMODE_WRITE);
+ swsusp_close();
return ret;
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
struct hib_bio_batch *hb)
{
- int error = 0;
+ int error;
sector_t offset;
if (!handle->cur)
@@ -489,16 +500,16 @@ static int swap_writer_finish(struct swap_map_handle *handle,
unsigned int flags, int error)
{
if (!error) {
- flush_swap_writer(handle);
pr_info("S");
error = mark_swapfiles(handle, flags);
pr_cont("|\n");
+ flush_swap_writer(handle);
}
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(handle);
- swsusp_close(FMODE_WRITE);
+ swsusp_close();
return error;
}
@@ -561,6 +572,7 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -570,7 +582,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -585,7 +597,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
@@ -594,11 +606,11 @@ static int crc32_threadfn(void *data)
unsigned i;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -607,12 +619,12 @@ static int crc32_threadfn(void *data)
for (i = 0; i < d->run_threads; i++)
*d->crc32 = crc32_le(*d->crc32,
d->unc[i], *d->unc_len[i]);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
@@ -629,7 +641,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
@@ -637,12 +649,12 @@ static int lzo_compress_threadfn(void *data)
struct cmp_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -651,7 +663,7 @@ static int lzo_compress_threadfn(void *data)
d->ret = lzo1x_1_compress(d->unc, d->unc_len,
d->cmp + LZO_HEADER, &d->cmp_len,
d->wrk);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -696,22 +708,19 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct cmp_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
/*
* Start the compression threads.
@@ -789,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
data[thr].unc_len = off;
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -797,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
break;
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -841,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
}
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
@@ -854,6 +863,7 @@ out_finish:
pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
+ hib_finish_batch(&hb);
if (crc) {
if (crc->thr)
kthread_stop(crc->thr);
@@ -874,7 +884,7 @@ out_clean:
* enough_swap - Make sure we have enough swap to save the image.
*
* Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
+ * space available from the resume partition.
*/
static int enough_swap(unsigned int nr_pages)
@@ -939,9 +949,9 @@ out_finish:
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
- * in a file-alike way
+ * in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -992,7 +1002,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1016,7 +1026,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1084,6 +1094,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -1097,7 +1108,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
@@ -1113,20 +1124,20 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
- * Deompression function that runs in its own thread.
+/*
+ * Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
{
struct dec_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -1139,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data)
flush_icache_range((unsigned long)d->unc,
(unsigned long)d->unc + d->unc_len);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -1187,22 +1198,19 @@ static int load_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct dec_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
clean_pages_on_decompress = true;
@@ -1327,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
crc->run_threads = 0;
}
@@ -1363,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
pg = 0;
}
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -1382,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -1413,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = snapshot_write_next(snapshot);
if (ret <= 0) {
crc->run_threads = thr + 1;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
goto out_finish;
}
@@ -1421,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
}
out_finish:
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
stop = ktime_get();
@@ -1447,6 +1455,7 @@ out_finish:
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
+ hib_finish_batch(&hb);
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
if (crc) {
@@ -1502,21 +1511,24 @@ end:
return error;
}
+static void *swsusp_holder;
+
/**
- * swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Open the resume device and check for the swsusp signature.
+ * @exclusive: Open the resume device exclusively.
*/
-int swsusp_check(void)
+int swsusp_check(bool exclusive)
{
+ void *holder = exclusive ? &swsusp_holder : NULL;
int error;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- FMODE_READ, NULL);
- if (!IS_ERR(hib_resume_bdev)) {
- set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ BLK_OPEN_READ, holder, NULL);
+ if (!IS_ERR(hib_resume_bdev_handle)) {
+ set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, 0,
- swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1524,20 +1536,26 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
}
+ if (!error && swsusp_header->flags & SF_HW_SIG &&
+ swsusp_header->hw_sig != swsusp_hardware_signature) {
+ pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n",
+ swsusp_header->hw_sig, swsusp_hardware_signature);
+ error = -EINVAL;
+ }
put:
if (error)
- blkdev_put(hib_resume_bdev, FMODE_READ);
+ bdev_release(hib_resume_bdev_handle);
else
pr_debug("Image signature found, resuming\n");
} else {
- error = PTR_ERR(hib_resume_bdev);
+ error = PTR_ERR(hib_resume_bdev_handle);
}
if (error)
@@ -1547,17 +1565,17 @@ put:
}
/**
- * swsusp_close - close swap device.
+ * swsusp_close - close resume device.
*/
-void swsusp_close(fmode_t mode)
+void swsusp_close(void)
{
- if (IS_ERR(hib_resume_bdev)) {
+ if (IS_ERR(hib_resume_bdev_handle)) {
pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, mode);
+ bdev_release(hib_resume_bdev_handle);
}
/**
@@ -1569,11 +1587,11 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d5eedc2baa2a..3a4e70366f35 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -26,6 +26,7 @@
#include "power.h"
+static bool need_wait;
static struct snapshot_data {
struct snapshot_handle handle;
@@ -35,23 +36,24 @@ static struct snapshot_data {
bool ready;
bool platform_support;
bool free_bitmaps;
- struct inode *bd_inode;
+ dev_t dev;
} snapshot_state;
-int is_hibernate_resume_dev(const struct inode *bd_inode)
+int is_hibernate_resume_dev(dev_t dev)
{
- return hibernation_available() && snapshot_state.bd_inode == bd_inode;
+ return hibernation_available() && snapshot_state.dev == dev;
}
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error, nr_calls = 0;
+ unsigned int sleep_flags;
+ int error;
if (!hibernation_available())
return -EPERM;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -69,31 +71,24 @@ static int snapshot_open(struct inode *inode, struct file *filp)
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
/* Hibernating. The image device should be accessible. */
- data->swap = swsusp_resume_device ?
- swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->swap = swap_type_of(swsusp_resume_device, 0);
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
- if (error)
- __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
} else {
/*
* Resuming. We may need to wait for the image device to
* appear.
*/
- wait_for_device_probe();
+ need_wait = true;
data->swap = -1;
data->mode = O_WRONLY;
- error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- } else
- nr_calls--;
-
- if (error)
- __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+ }
}
if (error)
hibernate_release();
@@ -101,10 +96,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->frozen = false;
data->ready = false;
data->platform_support = false;
- data->bd_inode = NULL;
+ data->dev = 0;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -112,12 +107,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_free();
data = filp->private_data;
- data->bd_inode = NULL;
+ data->dev = 0;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
@@ -130,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
PM_POST_HIBERNATION : PM_POST_RESTORE);
hibernate_release();
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return 0;
}
@@ -138,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp)
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -163,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
*offp += res;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -171,11 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned long sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
@@ -184,7 +187,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res <= 0)
goto unlock;
} else {
- res = PAGE_SIZE - pg_offp;
+ res = PAGE_SIZE;
}
if (!data_of(data->handle)) {
@@ -197,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res > 0)
*offp += res;
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -210,7 +213,6 @@ struct compat_resume_swap_area {
static int snapshot_set_swap_area(struct snapshot_data *data,
void __user *argp)
{
- struct block_device *bdev;
sector_t offset;
dev_t swdev;
@@ -237,16 +239,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data,
* User space encodes device types as two-byte values,
* so we need to recode them
*/
- if (!swdev) {
- data->swap = -1;
- return -EINVAL;
- }
- data->swap = swap_type_of(swdev, offset, &bdev);
+ data->swap = swap_type_of(swdev, offset);
if (data->swap < 0)
- return -ENODEV;
-
- data->bd_inode = bdev->bd_inode;
- bdput(bdev);
+ return swdev ? -ENODEV : -EINVAL;
+ data->dev = swdev;
return 0;
}
@@ -258,6 +254,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
loff_t size;
sector_t offset;
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 105df4dfc783..52571dcad768 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
{
struct rb_node *node;
struct wakelock *wl;
- char *str = buf;
- char *end = buf + PAGE_SIZE;
+ int len = 0;
mutex_lock(&wakelocks_lock);
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
if (wl->ws->active == show_active)
- str += scnprintf(str, end - str, "%s ", wl->name);
+ len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
- if (str > buf)
- str--;
- str += scnprintf(str, end - str, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
- return (str - buf);
+ return len;
}
#if CONFIG_PM_WAKELOCKS_LIMIT > 0
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+obj-$(CONFIG_PRINTK_INDEX) += index.o
+
+obj-$(CONFIG_PRINTK) += printk_support.o
+printk_support-y := printk_ringbuffer.o
+printk_support-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
new file mode 100644
index 000000000000..a6b27526baaf
--- /dev/null
+++ b/kernel/printk/index.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace indexing of printk formats
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "internal.h"
+
+extern struct pi_entry *__start_printk_index[];
+extern struct pi_entry *__stop_printk_index[];
+
+/* The base dir for module formats, typically debugfs/printk/index/ */
+static struct dentry *dfs_index;
+
+static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
+{
+ struct pi_entry **entries;
+ unsigned int nr_entries;
+
+#ifdef CONFIG_MODULES
+ if (mod) {
+ entries = mod->printk_index_start;
+ nr_entries = mod->printk_index_size;
+ } else
+#endif
+ {
+ /* vmlinux, comes from linker symbols */
+ entries = __start_printk_index;
+ nr_entries = __stop_printk_index - __start_printk_index;
+ }
+
+ if (pos >= nr_entries)
+ return NULL;
+
+ return entries[pos];
+}
+
+static void *pi_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ const struct module *mod = s->file->f_inode->i_private;
+ struct pi_entry *entry = pi_get_entry(mod, *pos);
+
+ (*pos)++;
+
+ return entry;
+}
+
+static void *pi_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Make show() print the header line. Do not update *pos because
+ * pi_next() still has to return the entry at index 0 later.
+ */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return pi_next(s, NULL, pos);
+}
+
+/*
+ * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only
+ * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be
+ * ignored for quoting.
+ */
+#define seq_escape_printf_format(s, src) \
+ seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\")
+
+static int pi_show(struct seq_file *s, void *v)
+{
+ const struct pi_entry *entry = v;
+ int level = LOGLEVEL_DEFAULT;
+ enum printk_info_flags flags = 0;
+ u16 prefix_len = 0;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "# <level/flags> filename:line function \"format\"\n");
+ return 0;
+ }
+
+ if (!entry->fmt)
+ return 0;
+
+ if (entry->level)
+ printk_parse_prefix(entry->level, &level, &flags);
+ else
+ prefix_len = printk_parse_prefix(entry->fmt, &level, &flags);
+
+
+ if (flags & LOG_CONT) {
+ /*
+ * LOGLEVEL_DEFAULT here means "use the same level as the
+ * message we're continuing from", not the default message
+ * loglevel, so don't display it as such.
+ */
+ if (level == LOGLEVEL_DEFAULT)
+ seq_puts(s, "<c>");
+ else
+ seq_printf(s, "<%d,c>", level);
+ } else
+ seq_printf(s, "<%d>", level);
+
+ seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func);
+ if (entry->subsys_fmt_prefix)
+ seq_escape_printf_format(s, entry->subsys_fmt_prefix);
+ seq_escape_printf_format(s, entry->fmt + prefix_len);
+ seq_puts(s, "\"\n");
+
+ return 0;
+}
+
+static void pi_stop(struct seq_file *p, void *v) { }
+
+static const struct seq_operations dfs_index_sops = {
+ .start = pi_start,
+ .next = pi_next,
+ .show = pi_show,
+ .stop = pi_stop,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dfs_index);
+
+#ifdef CONFIG_MODULES
+static const char *pi_get_module_name(struct module *mod)
+{
+ return mod ? mod->name : "vmlinux";
+}
+#else
+static const char *pi_get_module_name(struct module *mod)
+{
+ return "vmlinux";
+}
+#endif
+
+static void pi_create_file(struct module *mod)
+{
+ debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index,
+ mod, &dfs_index_fops);
+}
+
+#ifdef CONFIG_MODULES
+static void pi_remove_file(struct module *mod)
+{
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
+}
+
+static int pi_module_notify(struct notifier_block *nb, unsigned long op,
+ void *data)
+{
+ struct module *mod = data;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ pi_create_file(mod);
+ break;
+ case MODULE_STATE_GOING:
+ pi_remove_file(mod);
+ break;
+ default: /* we don't care about other module states */
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block module_printk_fmts_nb = {
+ .notifier_call = pi_module_notify,
+};
+
+static void __init pi_setup_module_notifier(void)
+{
+ register_module_notifier(&module_printk_fmts_nb);
+}
+#else
+static inline void __init pi_setup_module_notifier(void) { }
+#endif
+
+static int __init pi_init(void)
+{
+ struct dentry *dfs_root = debugfs_create_dir("printk", NULL);
+
+ dfs_index = debugfs_create_dir("index", dfs_root);
+ pi_setup_module_notifier();
+ pi_create_file(NULL);
+
+ return 0;
+}
+
+/* debugfs comes up on core and must be initialised first */
+postcore_initcall(pi_init);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 660f9a6bf73a..6c2afee5ef62 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,29 +3,56 @@
* internal.h - printk internal definitions
*/
#include <linux/percpu.h>
+#include <linux/console.h>
+#include "printk_ringbuffer.h"
+
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+void __init printk_sysctl_init(void);
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#else
+#define printk_sysctl_init() do { } while (0)
+#endif
+
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NBCON) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
+
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
-extern raw_spinlock_t logbuf_lock;
+/* Flags for a single printk record. */
+enum printk_info_flags {
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
-__printf(5, 0)
+extern struct printk_ringbuffer *prb;
+
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-void printk_safe_init(void);
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -40,35 +67,72 @@ bool printk_percpu_data_ready(void);
local_irq_restore(flags); \
} while (0)
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
+void defer_console_output(void);
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags);
-void defer_console_output(void);
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
+bool nbcon_alloc(struct console *con);
+void nbcon_init(struct console *con);
+void nbcon_free(struct console *con);
#else
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline void printk_safe_init(void) { }
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
+static inline bool nbcon_alloc(struct console *con) { return false; }
+static inline void nbcon_init(struct console *con) { }
+static inline void nbcon_free(struct console *con) { }
+
#endif /* CONFIG_PRINTK */
+
+extern struct printk_buffers printk_shared_pbufs;
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
+
+bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..b96077152f49
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,1029 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include "internal.h"
+/*
+ * Printk console printing implementation for consoles which does not depend
+ * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ * - The 'prio' field contains the priority of the context that owns the
+ * console. Only higher priority contexts are allowed to take over the
+ * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ * - The 'cpu' field denotes on which CPU the console is locked. It is used
+ * to prevent busy waiting on the same CPU. Also it informs the lock owner
+ * that it has lost the lock in a more complex scenario when the lock was
+ * taken over by a higher priority context, released, and taken on another
+ * CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ * - The 'req_prio' field is used by the handover approach to make the
+ * current owner aware that there is a context with a higher priority
+ * waiting for the friendly handover.
+ *
+ * - The 'unsafe' field allows to take over the console in a safe way in the
+ * middle of emitting a message. The field is set only when accessing some
+ * shared resources or when the console device is manipulated. It can be
+ * cleared, for example, after emitting one character when the console
+ * device is in a consistent state.
+ *
+ * - The 'unsafe_takeover' field is set when a hostile takeover took the
+ * console in an unsafe state. The console will stay in the unsafe state
+ * until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ * 1) Direct acquire when the console is not owned or is owned by a lower
+ * priority context and is in a safe state.
+ *
+ * 2) Friendly handover mechanism uses a request/grant handshake. It is used
+ * when the current owner has lower priority and the console is in an
+ * unsafe state.
+ *
+ * The requesting context:
+ *
+ * a) Sets its priority into the 'req_prio' field.
+ *
+ * b) Waits (with a timeout) for the owning context to unlock the
+ * console.
+ *
+ * c) Takes the lock and clears the 'req_prio' field.
+ *
+ * The owning context:
+ *
+ * a) Observes the 'req_prio' field set on exit from the unsafe
+ * console state.
+ *
+ * b) Gives up console ownership by clearing the 'prio' field.
+ *
+ * 3) Unsafe hostile takeover allows to take over the lock even when the
+ * console is an unsafe state. It is used only in panic() by the final
+ * attempt to flush consoles in a try and hope mode.
+ *
+ * Note that separate record buffers are used in panic(). As a result,
+ * the messages can be read and formatted without any risk even after
+ * using the hostile takeover in unsafe state.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ * - Preference for higher priority contexts.
+ * - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ * - What is marked as an unsafe section.
+ * - Whether to spin-wait if there is already an owner and the console is
+ * in an unsafe state.
+ * - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ * acquire()
+ * output_one_printk_record()
+ * release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
+ */
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+ atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con: Console to read
+ * @state: The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+ state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con: Console to update
+ * @cur: Old/expected state
+ * @new: New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+ struct nbcon_state *new)
+{
+ return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+#ifdef CONFIG_64BIT
+
+#define __seq_to_nbcon_seq(seq) (seq)
+#define __nbcon_seq_to_seq(seq) (seq)
+
+#else /* CONFIG_64BIT */
+
+#define __seq_to_nbcon_seq(seq) ((u32)seq)
+
+static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
+{
+ u64 seq;
+ u64 rb_next_seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the next sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ rb_next_seq = prb_next_seq(prb);
+ seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Return: Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+ unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+ return __nbcon_seq_to_seq(nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con: Console to work on
+ * @seq: Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+ /*
+ * If the specified record no longer exists, the oldest available record
+ * is chosen. This is especially important on 32bit systems because only
+ * the lower 32 bits of the sequence number are stored. The upper 32 bits
+ * are derived from the sequence numbers available in the ringbuffer.
+ */
+ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+
+ /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
+ con->seq = 0;
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ * @new_seq: The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the later case, it will stop printing anyway.
+ */
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+ unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+ struct console *con = ctxt->console;
+
+ if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+ __seq_to_nbcon_seq(new_seq))) {
+ ctxt->seq = new_seq;
+ } else {
+ ctxt->seq = nbcon_seq_read(con);
+ }
+}
+
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or the current owner or waiter has the same or higher
+ * priority. No acquire method can be successful in
+ * this case.
+ *
+ * -EBUSY: The current owner has a lower priority but the console
+ * in an unsafe state. The caller should try using
+ * the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ do {
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+ return -EPERM;
+
+ if (cur->unsafe)
+ return -EBUSY;
+
+ /*
+ * The console should never be safe for a direct acquire
+ * if an unsafe hostile takeover has ever happened.
+ */
+ WARN_ON_ONCE(cur->unsafe_takeover);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+ /*
+ * The request context is well defined by the @req_prio because:
+ *
+ * - Only a context with a higher priority can take over the request.
+ * - There are only three priorities.
+ * - Only one CPU is allowed to request PANIC priority.
+ * - Lower priorities are ignored during panic() until reboot.
+ *
+ * As a result, the following scenario is *not* possible:
+ *
+ * 1. Another context with a higher priority directly takes ownership.
+ * 2. The higher priority context releases the ownership.
+ * 3. A lower priority context takes the ownership.
+ * 4. Another context with the same priority as this context
+ * creates a request and starts waiting.
+ */
+
+ return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ * requested a handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return: 0 on success and @cur is updated to the new console state.
+ * Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU
+ * or this context is no longer the waiter.
+ *
+ * -EBUSY: The console is still locked. The caller should
+ * continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ * except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ /* Note that the caller must still remove the request! */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ /*
+ * Note that the waiter will also change if there was an unsafe
+ * hostile takeover.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* If still locked, caller should continue waiting. */
+ if (cur->prio != NBCON_PRIO_NONE)
+ return -EBUSY;
+
+ /*
+ * The previous owner should have never released ownership
+ * in an unsafe region.
+ */
+ WARN_ON_ONCE(cur->unsafe);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * The acquire could fail only when it has been taken
+ * over by a higher priority context.
+ */
+ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+ return -EPERM;
+ }
+
+ /* Handover success. This context now owns the console. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or a higher priority context has taken over the
+ * console or the handover request.
+ *
+ * -EBUSY: The current owner is on the same CPU so that the hand
+ * shake could not work. Or the current owner is not
+ * willing to wait (zero timeout). Or the console does
+ * not enter the safe state before timeout passed. The
+ * caller might still use the unsafe hostile takeover
+ * when allowed.
+ *
+ * -EAGAIN: @cur has changed when creating the handover request.
+ * The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+ int timeout;
+ int request_err = -EBUSY;
+
+ /*
+ * Check that the handover is called when the direct acquire failed
+ * with -EBUSY.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(!cur->unsafe);
+
+ /* Handover is not possible on the same CPU. */
+ if (cur->cpu == cpu)
+ return -EBUSY;
+
+ /*
+ * Console stays unsafe after an unsafe takeover until re-initialized.
+ * Waiting is not going to help in this case.
+ */
+ if (cur->unsafe_takeover)
+ return -EBUSY;
+
+ /* Is the caller willing to wait? */
+ if (ctxt->spinwait_max_us == 0)
+ return -EBUSY;
+
+ /*
+ * Setup a request for the handover. The caller should try to acquire
+ * the console directly when the current state has been modified.
+ */
+ new.atom = cur->atom;
+ new.req_prio = ctxt->prio;
+ if (!nbcon_state_try_cmpxchg(con, cur, &new))
+ return -EAGAIN;
+
+ cur->atom = new.atom;
+
+ /* Wait until there is no owner and then acquire the console. */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* On successful acquire, this request is cleared. */
+ request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+ if (!request_err)
+ return 0;
+
+ /*
+ * If the acquire should be aborted, it must be ensured
+ * that the request is removed before returning to caller.
+ */
+ if (request_err == -EPERM)
+ break;
+
+ udelay(1);
+
+ /* Re-read the state because some time has passed. */
+ nbcon_state_read(con, cur);
+ }
+
+ /* Timed out or aborted. Carefully remove handover request. */
+ do {
+ /*
+ * No need to remove request if there is a new waiter. This
+ * can only happen if a higher priority context has taken over
+ * the console or the handover request.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* Unset request for handover. */
+ new.atom = cur->atom;
+ new.req_prio = NBCON_PRIO_NONE;
+ if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * Request successfully unset. Report failure of
+ * acquiring via handover.
+ */
+ cur->atom = new.atom;
+ return request_err;
+ }
+
+ /*
+ * Unable to remove request. Try to acquire in case
+ * the owner has released the lock.
+ */
+ } while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+ /* Lucky timing. The acquire succeeded while removing the request. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return: 0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ if (!ctxt->allow_unsafe_takeover)
+ return -EPERM;
+
+ /* Ensure caller is allowed to perform unsafe hostile takeovers. */
+ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+ return -EPERM;
+
+ /*
+ * Check that try_acquire_direct() and try_acquire_handover() returned
+ * -EBUSY in the right situation.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(cur->unsafe != true);
+
+ do {
+ new.atom = cur->atom;
+ new.cpu = cpu;
+ new.prio = ctxt->prio;
+ new.unsafe |= cur->unsafe_takeover;
+ new.unsafe_takeover |= cur->unsafe;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static struct printk_buffers panic_nbcon_pbufs;
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt: The context of the caller
+ *
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+__maybe_unused
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ int err;
+
+ nbcon_state_read(con, &cur);
+try_again:
+ err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_handover(ctxt, &cur);
+ if (err == -EAGAIN)
+ goto try_again;
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+ if (err)
+ return false;
+
+ /* Acquire succeeded. */
+
+ /* Assign the appropriate buffer for this context. */
+ if (atomic_read(&panic_cpu) == cpu)
+ ctxt->pbufs = &panic_nbcon_pbufs;
+ else
+ ctxt->pbufs = con->pbufs;
+
+ /* Set the record sequence for this context to print. */
+ ctxt->seq = nbcon_seq_read(ctxt->console);
+
+ return true;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+ int expected_prio)
+{
+ /*
+ * Since consoles can only be acquired by higher priorities,
+ * owning contexts are uniquely identified by @prio. However,
+ * since contexts can unexpectedly lose ownership, it is
+ * possible that later another owner appears with the same
+ * priority. For this reason @cpu is also needed.
+ */
+
+ if (cur->prio != expected_prio)
+ return false;
+
+ if (cur->cpu != expected_cpu)
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ */
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+ break;
+
+ new.atom = cur.atom;
+ new.prio = NBCON_PRIO_NONE;
+
+ /*
+ * If @unsafe_takeover is set, it is kept set so that
+ * the state remains permanently unsafe.
+ */
+ new.unsafe |= cur.unsafe_takeover;
+
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @cur: The current console state
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+
+ /* Make sure this context still owns the console. */
+ if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+ return false;
+
+ /* The console owner can proceed if there is no waiter. */
+ if (cur->req_prio == NBCON_PRIO_NONE)
+ return true;
+
+ /*
+ * A console owner within an unsafe region is always allowed to
+ * proceed, even if there are waiters. It can perform a handover
+ * when exiting the unsafe region. Otherwise the waiter will
+ * need to perform an unsafe hostile takeover.
+ */
+ if (cur->unsafe)
+ return true;
+
+ /* Waiters always have higher priorities than owners. */
+ WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+ /*
+ * Having a safe point for take over and eventually a few
+ * duplicated characters or a full line is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Release and hand over.
+ */
+ nbcon_context_release(ctxt);
+
+ /*
+ * It is not clear whether the waiter really took over ownership. The
+ * outermost callsite must make the final decision whether console
+ * ownership is needed for it to proceed. If yes, it must reacquire
+ * ownership (possibly hostile) before carefully proceeding.
+ *
+ * The calling context no longer owns the console so go back all the
+ * way instead of trying to implement reacquire heuristics in tons of
+ * places.
+ */
+ return false;
+}
+
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ nbcon_state_read(con, &cur);
+
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
+#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @unsafe: The new value for the unsafe bit
+ *
+ * Return: True if the unsafe state was updated and this context still
+ * owns the console. Otherwise false if ownership was handed
+ * over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ /*
+ * The unsafe bit must not be cleared if an
+ * unsafe hostile takeover has occurred.
+ */
+ if (!unsafe && cur.unsafe_takeover)
+ goto out;
+
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ new.atom = cur.atom;
+ new.unsafe = unsafe;
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ cur.atom = new.atom;
+out:
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return nbcon_context_enter_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer,
+ */
+__maybe_unused
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+ unsigned long con_dropped;
+ struct nbcon_state cur;
+ unsigned long dropped;
+ bool done;
+
+ /*
+ * The printk buffers are filled within an unsafe section. This
+ * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+ * clobbering each other.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+ if (!ctxt->backlog)
+ return nbcon_context_exit_unsafe(ctxt);
+
+ /*
+ * @con->dropped is not protected in case of an unsafe hostile
+ * takeover. In that situation the update can be racy so
+ * annotate it accordingly.
+ */
+ con_dropped = data_race(READ_ONCE(con->dropped));
+
+ dropped = con_dropped + pmsg.dropped;
+ if (dropped && !is_extended)
+ console_prepend_dropped(&pmsg, dropped);
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return false;
+
+ /* For skipped records just update seq/dropped in @con. */
+ if (pmsg.outbuf_len == 0)
+ goto update_con;
+
+ /* Initialize the write context for driver callbacks. */
+ wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+ wctxt->len = pmsg.outbuf_len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+
+ if (con->write_atomic) {
+ done = con->write_atomic(con, wctxt);
+ } else {
+ nbcon_context_release(ctxt);
+ WARN_ON_ONCE(1);
+ done = false;
+ }
+
+ /* If not done, the emit was aborted. */
+ if (!done)
+ return false;
+
+ /*
+ * Since any dropped message was successfully output, reset the
+ * dropped count for the console.
+ */
+ dropped = 0;
+update_con:
+ /*
+ * The dropped count and the sequence number are updated within an
+ * unsafe section. This limits update races to the panic context and
+ * allows the panic context to win.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ if (dropped != con_dropped) {
+ /* Counterpart to the READ_ONCE() above. */
+ WRITE_ONCE(con->dropped, dropped);
+ }
+
+ nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+
+/**
+ * nbcon_alloc - Allocate buffers needed by the nbcon console
+ * @con: Console to allocate buffers for
+ *
+ * Return: True on success. False otherwise and the console cannot
+ * be used.
+ *
+ * This is not part of nbcon_init() because buffer allocation must
+ * be performed earlier in the console registration process.
+ */
+bool nbcon_alloc(struct console *con)
+{
+ if (con->flags & CON_BOOT) {
+ /*
+ * Boot console printing is synchronized with legacy console
+ * printing, so boot consoles can share the same global printk
+ * buffers.
+ */
+ con->pbufs = &printk_shared_pbufs;
+ } else {
+ con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+ if (!con->pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * nbcon_init - Initialize the nbcon console specific data
+ * @con: Console to initialize
+ *
+ * nbcon_alloc() *must* be called and succeed before this function
+ * is called.
+ *
+ * This function expects that the legacy @con->seq has been set.
+ */
+void nbcon_init(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ /* nbcon_alloc() must have been called and successful! */
+ BUG_ON(!con->pbufs);
+
+ nbcon_seq_force(con, con->seq);
+ nbcon_state_set(con, &state);
+}
+
+/**
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con: Console to free/cleanup nbcon data
+ */
+void nbcon_free(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ nbcon_state_set(con, &state);
+
+ /* Boot consoles share global printk buffers. */
+ if (!(con->flags & CON_BOOT))
+ kfree(con->pbufs);
+
+ con->pbufs = NULL;
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b71eaf5f5a86..f2444b581e16 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
+#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"
@@ -70,6 +71,8 @@ EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
+EXPORT_TRACEPOINT_SYMBOL_GPL(console);
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -78,13 +81,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq
+ * and also provides serialization for console printing.
*/
-static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+static DEFINE_SEMAPHORE(console_sem, 1);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -96,6 +106,20 @@ int __read_mostly suppress_printk;
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -145,8 +169,10 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
@@ -165,12 +191,12 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
-
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -209,9 +235,70 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return 0;
}
+#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
+
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
-/* Number of registered extended console drivers. */
-static int nr_ext_console_drivers;
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
@@ -255,20 +342,20 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
+static bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
- * locked without the console sempahore held).
- */
-static int console_locked, console_suspended;
-
-/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
+ * locked without the console semaphore held).
*/
-static struct console *exclusive_console;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -279,7 +366,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
-static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -294,30 +380,22 @@ enum con_msg_format_flags {
static int console_msg_format = MSG_FORMAT_DEFAULT;
/*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
- *
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
*
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
*
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
*
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
@@ -327,25 +405,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
- *
- * Example of a message structure:
- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
- * 0008 34 00 record is 52 bytes long
- * 000a 0b 00 text is 11 bytes long
- * 000c 1f 00 dictionary is 23 bytes long
- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
- * 0010 69 74 27 73 20 61 20 6c "it's a l"
- * 69 6e 65 "ine"
- * 001b 44 45 56 49 43 "DEVIC"
- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
- * 52 49 56 45 52 3d 62 75 "RIVER=bu"
- * 67 "g"
- * 0032 00 00 00 padding to next message header
- *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
+ *
+ * Example of record values:
+ * record.text_buf = "it's a line" (unterminated)
+ * record.info.seq = 56
+ * record.info.ts_nsec = 36863
+ * record.info.text_len = 11
+ * record.info.facility = 0 (LOG_KERN)
+ * record.info.flags = 0
+ * record.info.level = 3 (LOG_ERR)
+ * record.info.caller_id = 299 (task 299)
+ * record.info.dev_info.subsystem = "pci" (terminated)
+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
+ *
+ * The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
@@ -360,100 +435,44 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* non-prinatable characters are escaped in the "\xff" notation.
*/
-enum log_flags {
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
-
-struct printk_log {
- u64 ts_nsec; /* timestamp in nanoseconds */
- u16 len; /* length of entire record */
- u16 text_len; /* length of text buffer */
- u16 dict_len; /* length of dictionary buffer */
- u8 facility; /* syslog facility */
- u8 flags:5; /* internal record flags */
- u8 level:3; /* syslog level */
-#ifdef CONFIG_PRINTK_CALLER
- u32 caller_id; /* thread id or processor id */
-#endif
-}
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-__packed __aligned(4)
-#endif
-;
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_MUTEX(syslog_lock);
+#ifdef CONFIG_PRINTK
/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
*/
-DEFINE_RAW_SPINLOCK(logbuf_lock);
+static int __read_mostly suppress_panic_printk;
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
-
-#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
-#ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
-#else
-#define PREFIX_MAX 32
-#endif
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
+#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#define LOG_BUF_LEN_MAX (u32)(1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -461,129 +480,69 @@ static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5 /* 32 character average length */
+
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+ PRB_AVGBITS, &__log_buf[0]);
+
+static struct printk_ringbuffer printk_rb_dynamic;
+
+struct printk_ringbuffer *prb = &printk_rb_static;
+
+/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
*/
-static bool __printk_percpu_data_ready __read_mostly;
+static bool __printk_percpu_data_ready __ro_after_init;
bool printk_percpu_data_ready(void)
{
return __printk_percpu_data_ready;
}
-/* Return log buffer address */
-char *log_buf_addr_get(void)
+/* Must be called under syslog_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
{
- return log_buf;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[0] = val;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
}
-/* Return log buffer size */
-u32 log_buf_len_get(void)
-{
- return log_buf_len;
-}
-
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log);
-}
-
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
- return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
-
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
+ do {
+ seq = raw_read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
+ return val;
}
-static int log_make_free_space(u32 msg_size)
+/* Return log buffer address */
+char *log_buf_addr_get(void)
{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
+ return log_buf;
}
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+/* Return log buffer size */
+u32 log_buf_len_get(void)
{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
+ return log_buf_len;
}
/*
@@ -594,84 +553,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+
if (*text_len > max_text_len)
*text_len = max_text_len;
- /* enable the warning message */
- *trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
-}
-
-/* insert record into the buffer, discard old ones, update heads */
-static int log_store(u32 caller_id, int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
- const char *text, u16 text_len)
-{
- struct printk_log *msg;
- u32 size, pad_len;
- u16 trunc_msg_len = 0;
-
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
-
- if (log_make_free_space(size)) {
- /* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
- return 0;
- }
-
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
- /*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
- */
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
- }
- /* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
- memcpy(log_text(msg), text, text_len);
- msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
- memcpy(log_dict(msg), dict, dict_len);
- msg->dict_len = dict_len;
- msg->facility = facility;
- msg->level = level & 7;
- msg->flags = flags & 0x1f;
- if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
+ /* enable the warning message (if there is room) */
+ *trunc_msg_len = strlen(trunc_msg);
+ if (*text_len >= *trunc_msg_len)
+ *text_len -= *trunc_msg_len;
else
- msg->ts_nsec = local_clock();
-#ifdef CONFIG_PRINTK_CALLER
- msg->caller_id = caller_id;
-#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = size;
-
- /* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
-
- return msg->text_len;
+ *trunc_msg_len = 0;
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -723,13 +621,13 @@ static void append_char(char **pp, char *e, char c)
*(*pp)++ = c;
}
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq)
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
{
- u64 ts_usec = msg->ts_nsec;
+ u64 ts_usec = info->ts_nsec;
char caller[20];
#ifdef CONFIG_PRINTK_CALLER
- u32 id = msg->caller_id;
+ u32 id = info->caller_id;
snprintf(caller, sizeof(caller), ",caller=%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
@@ -740,13 +638,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
do_div(ts_usec, 1000);
return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ (info->facility << 3) | info->level, info->seq,
+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len)
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+ const char *text, size_t text_len,
+ unsigned char endc)
{
char *p = buf, *e = buf + size;
size_t i;
@@ -760,45 +658,52 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
else
append_char(&p, e, c);
}
- append_char(&p, e, '\n');
+ append_char(&p, e, endc);
- if (dict_len) {
- bool line = true;
+ return p - buf;
+}
- for (i = 0; i < dict_len; i++) {
- unsigned char c = dict[i];
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+ const char *key, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t len;
- if (line) {
- append_char(&p, e, ' ');
- line = false;
- }
+ if (!val_len)
+ return 0;
- if (c == '\0') {
- append_char(&p, e, '\n');
- line = true;
- continue;
- }
+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
- if (c < ' ' || c >= 127 || c == '\\') {
- p += scnprintf(p, e - p, "\\x%02x", c);
- continue;
- }
+ return len;
+}
- append_char(&p, e, c);
- }
- append_char(&p, e, '\n');
- }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info)
+{
+ ssize_t len;
- return p - buf;
+ len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+ if (!dev_info)
+ goto out;
+
+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+ dev_info->subsystem);
+ len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+ dev_info->device);
+out:
+ return len;
}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
- u32 idx;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
- char buf[CONSOLE_EXT_LOG_MAX];
+ struct printk_buffers pbufs;
};
static __printf(3, 4) __cold
@@ -808,7 +713,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+ r = vprintk_emit(facility, level, NULL, fmt, args);
va_end(args);
return r;
@@ -824,7 +729,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > LOG_LINE_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -867,7 +772,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
if (LOG_FACILITY(u) != 0)
facility = LOG_FACILITY(u);
endp++;
- len -= endp - line;
line = endp;
}
}
@@ -881,84 +785,83 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_log *msg;
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- logbuf_lock_irq();
- while (user->seq == log_next_seq) {
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
goto out;
}
- logbuf_unlock_irq();
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
- logbuf_lock_irq();
}
- if (user->seq < log_first_seq) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
- logbuf_unlock_irq();
goto out;
}
- msg = log_from_idx(user->idx);
- len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq);
- len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
-
- user->idx = log_next(user->idx);
- user->seq++;
- logbuf_unlock_irq();
+ atomic64_set(&user->seq, pmsg.seq + 1);
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
}
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only few operations are supported because the device works only with the
+ * entire variable length messages (records). Non-standard values are
+ * returned in the other cases and has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -966,40 +869,33 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
return ret;
}
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
+ struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
- if (user->seq < log_next_seq) {
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
+ if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
return ret;
}
@@ -1020,7 +916,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
return err;
}
- user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
@@ -1029,10 +925,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
- logbuf_unlock_irq();
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
return 0;
@@ -1042,13 +935,10 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
- kfree(user);
+ kvfree(user);
return 0;
}
@@ -1072,23 +962,61 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct dev_printk_info *dev_info = NULL;
+
+ VMCOREINFO_SYMBOL(prb);
+ VMCOREINFO_SYMBOL(printk_rb_static);
+ VMCOREINFO_SYMBOL(clear_seq);
+
/*
- * Export struct printk_log size and field offsets. User space tools can
+ * Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
- VMCOREINFO_STRUCT_SIZE(printk_log);
- VMCOREINFO_OFFSET(printk_log, ts_nsec);
- VMCOREINFO_OFFSET(printk_log, len);
- VMCOREINFO_OFFSET(printk_log, text_len);
- VMCOREINFO_OFFSET(printk_log, dict_len);
-#ifdef CONFIG_PRINTK_CALLER
- VMCOREINFO_OFFSET(printk_log, caller_id);
-#endif
+
+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+ VMCOREINFO_OFFSET(prb_desc_ring, descs);
+ VMCOREINFO_OFFSET(prb_desc_ring, infos);
+ VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc);
+ VMCOREINFO_OFFSET(prb_desc, state_var);
+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+ VMCOREINFO_STRUCT_SIZE(printk_info);
+ VMCOREINFO_OFFSET(printk_info, seq);
+ VMCOREINFO_OFFSET(printk_info, ts_nsec);
+ VMCOREINFO_OFFSET(printk_info, text_len);
+ VMCOREINFO_OFFSET(printk_info, caller_id);
+ VMCOREINFO_OFFSET(printk_info, dev_info);
+
+ VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+ VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+ VMCOREINFO_OFFSET(dev_printk_info, device);
+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+ VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+ VMCOREINFO_OFFSET(prb_data_ring, data);
+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+ VMCOREINFO_SIZE(atomic_long_t);
+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -1160,17 +1088,50 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_reserved_entry e;
+ struct printk_record dest_r;
+
+ prb_rec_init_wr(&dest_r, r->info->text_len);
+
+ if (!prb_reserve(&e, rb, &dest_r))
+ return 0;
+
+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+ dest_r.info->text_len = r->info->text_len;
+ dest_r.info->facility = r->info->facility;
+ dest_r.info->level = r->info->level;
+ dest_r.info->flags = r->info->flags;
+ dest_r.info->ts_nsec = r->info->ts_nsec;
+ dest_r.info->caller_id = r->info->caller_id;
+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+ prb_final_commit(&e);
+
+ return prb_record_text_space(&e);
+}
+
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
+
void __init setup_log_buf(int early)
{
+ struct printk_info *new_infos;
+ unsigned int new_descs_count;
+ struct prb_desc *new_descs;
+ struct printk_info info;
+ struct printk_record r;
+ unsigned int text_size;
+ size_t new_descs_size;
+ size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ u64 seq;
/*
* Some archs call setup_log_buf() multiple times - first is very
@@ -1189,24 +1150,88 @@ void __init setup_log_buf(int early)
if (!new_log_buf_len)
return;
+ new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+ if (new_descs_count == 0) {
+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
+ return;
+ }
+
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %lu bytes not available\n",
- new_log_buf_len);
+ pr_err("log_buf_len: %lu text bytes not available\n",
+ new_log_buf_len);
return;
}
- logbuf_lock_irqsave(flags);
+ new_descs_size = new_descs_count * sizeof(struct prb_desc);
+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+ if (unlikely(!new_descs)) {
+ pr_err("log_buf_len: %zu desc bytes not available\n",
+ new_descs_size);
+ goto err_free_log_buf;
+ }
+
+ new_infos_size = new_descs_count * sizeof(struct printk_info);
+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+ if (unlikely(!new_infos)) {
+ pr_err("log_buf_len: %zu info bytes not available\n",
+ new_infos_size);
+ goto err_free_descs;
+ }
+
+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+ prb_init(&printk_rb_dynamic,
+ new_log_buf, ilog2(new_log_buf_len),
+ new_descs, ilog2(new_descs_count),
+ new_infos);
+
+ local_irq_save(flags);
+
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
- logbuf_unlock_irqrestore(flags);
+
+ free = __LOG_BUF_LEN;
+ prb_for_each_record(0, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ prb = &printk_rb_dynamic;
+
+ local_irq_restore(flags);
+
+ /*
+ * Copy any remaining messages that might have appeared from
+ * NMI context after copying but before switching to the
+ * dynamic buffer.
+ */
+ prb_for_each_record(seq, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ if (seq != prb_next_seq(&printk_rb_static)) {
+ pr_err("dropped %llu messages\n",
+ prb_next_seq(&printk_rb_static) - seq);
+ }
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+ return;
+
+err_free_descs:
+ memblock_free(new_descs, new_descs_size);
+err_free_log_buf:
+ memblock_free(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -1313,18 +1338,18 @@ static size_t print_caller(u32 id, char *buf)
#define print_caller(id, buf) 0
#endif
-static size_t print_prefix(const struct printk_log *msg, bool syslog,
- bool time, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
if (syslog)
- len = print_syslog((msg->facility << 3) | msg->level, buf);
+ len = print_syslog((info->facility << 3) | info->level, buf);
if (time)
- len += print_time(msg->ts_nsec, buf + len);
+ len += print_time(info->ts_nsec, buf + len);
- len += print_caller(msg->caller_id, buf + len);
+ len += print_caller(info->caller_id, buf + len);
if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
buf[len++] = ' ';
@@ -1334,71 +1359,244 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ * - Add prefix for each line.
+ * - Drop truncated lines that no longer fit into the buffer.
+ * - Add the trailing newline that has been removed in vprintk_store().
+ * - Add a string terminator.
+ *
+ * Since the produced string is always terminated, the maximum possible
+ * return value is @r->text_buf_size - 1;
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The terminator is not counted. The dropped
+ * line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+ bool time)
{
- const char *text = log_text(msg);
- size_t text_size = msg->text_len;
+ size_t text_len = r->info->text_len;
+ size_t buf_size = r->text_buf_size;
+ char *text = r->text_buf;
+ char prefix[PRINTK_PREFIX_MAX];
+ bool truncated = false;
+ size_t prefix_len;
+ size_t line_len;
size_t len = 0;
- char prefix[PREFIX_MAX];
- const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
+ char *next;
- do {
- const char *next = memchr(text, '\n', text_size);
- size_t text_len;
+ /*
+ * If the message was truncated because the buffer was not large
+ * enough, treat the available text as if it were the full text.
+ */
+ if (text_len > buf_size)
+ text_len = buf_size;
+ prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+
+ /*
+ * @text_len: bytes of unprocessed text
+ * @line_len: bytes of current line _without_ newline
+ * @text: pointer to beginning of current line
+ * @len: number of bytes prepared in r->text_buf
+ */
+ for (;;) {
+ next = memchr(text, '\n', text_len);
if (next) {
- text_len = next - text;
- next++;
- text_size -= next - text;
+ line_len = next - text;
} else {
- text_len = text_size;
+ /* Drop truncated line(s). */
+ if (truncated)
+ break;
+ line_len = text_len;
}
- if (buf) {
- if (prefix_len + text_len + 1 >= size - len)
+ /*
+ * Truncate the text if there is not enough space to add the
+ * prefix and a trailing newline and a terminator.
+ */
+ if (len + prefix_len + text_len + 1 + 1 > buf_size) {
+ /* Drop even the current line if no space. */
+ if (len + prefix_len + line_len + 1 + 1 > buf_size)
break;
- memcpy(buf + len, prefix, prefix_len);
- len += prefix_len;
- memcpy(buf + len, text, text_len);
- len += text_len;
- buf[len++] = '\n';
- } else {
- /* SYSLOG_ACTION_* buffer size only calculation */
- len += prefix_len + text_len + 1;
+ text_len = buf_size - len - prefix_len - 1 - 1;
+ truncated = true;
+ }
+
+ memmove(text + prefix_len, text, text_len);
+ memcpy(text, prefix, prefix_len);
+
+ /*
+ * Increment the prepared length to include the text and
+ * prefix that were just moved+copied. Also increment for the
+ * newline at the end of this line. If this is the last line,
+ * there is no newline, but it will be added immediately below.
+ */
+ len += prefix_len + line_len + 1;
+ if (text_len == line_len) {
+ /*
+ * This is the last line. Add the trailing newline
+ * removed in vprintk_store().
+ */
+ text[prefix_len + line_len] = '\n';
+ break;
}
- text = next;
- } while (text);
+ /*
+ * Advance beyond the added prefix and the related line with
+ * its newline.
+ */
+ text += prefix_len + line_len + 1;
+
+ /*
+ * The remaining text has only decreased by the line with its
+ * newline.
+ *
+ * Note that @text_len can become zero. It happens when @text
+ * ended with a newline (either due to truncation or the
+ * original string ending with "\n\n"). The loop is correctly
+ * repeated and (if not truncated) an empty line with a prefix
+ * will be prepared.
+ */
+ text_len -= line_len + 1;
+ }
+
+ /*
+ * If a buffer was provided, it will be terminated. Space for the
+ * string terminator is guaranteed to be available. The terminator is
+ * not counted in the return value.
+ */
+ if (buf_size > 0)
+ r->text_buf[len] = 0;
return len;
}
+static size_t get_record_print_text_size(struct printk_info *info,
+ unsigned int line_count,
+ bool syslog, bool time)
+{
+ char prefix[PRINTK_PREFIX_MAX];
+ size_t prefix_len;
+
+ prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+ /*
+ * Each line will be preceded with a prefix. The intermediate
+ * newlines are already within the text, but a final trailing
+ * newline will be added.
+ */
+ return ((prefix_len * line_count) + info->text_len + 1);
+}
+
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is a best effort
+ * that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
+
+/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
- struct printk_log *msg;
int len = 0;
+ u64 seq;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- while (size > 0) {
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
+
+ mutex_lock(&syslog_lock);
+
+ /*
+ * Wait for the @syslog_seq record to be available. @syslog_seq may
+ * change while waiting.
+ */
+ do {
+ seq = syslog_seq;
+
+ mutex_unlock(&syslog_lock);
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ len = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
+ mutex_lock(&syslog_lock);
+
+ if (len)
+ goto out;
+ } while (syslog_seq != seq);
+
+ /*
+ * Copy records that fit into the buffer. The above cycle makes sure
+ * that the first record is always available.
+ */
+ do {
size_t n;
size_t skip;
+ int err;
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
- if (syslog_seq == log_next_seq) {
- logbuf_unlock_irq();
+ if (!prb_read_valid(prb, syslog_seq, &r))
break;
+
+ if (r.info->seq != syslog_seq) {
+ /* message is gone, move to next valid one */
+ syslog_seq = r.info->seq;
+ syslog_partial = 0;
}
/*
@@ -1409,13 +1607,10 @@ static int syslog_print(char __user *buf, int size)
syslog_time = printk_time;
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1424,12 +1619,15 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
if (!n)
break;
- if (copy_to_user(buf, text + skip, n)) {
+ mutex_unlock(&syslog_lock);
+ err = copy_to_user(buf, text + skip, n);
+ mutex_lock(&syslog_lock);
+
+ if (err) {
if (!len)
len = -EFAULT;
break;
@@ -1438,83 +1636,60 @@ static int syslog_print(char __user *buf, int size)
len += n;
size -= n;
buf += n;
- }
-
+ } while (size);
+out:
+ mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
static int syslog_print_all(char __user *buf, int size, bool clear)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
- len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ prb_for_each_record(seq, prb, seq, &r) {
+ int textlen;
- len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ textlen = record_print_text(&r, true, time);
- idx = log_next(idx);
- seq++;
+ if (len + textlen > size) {
+ seq--;
+ break;
+ }
- logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
+ if (len < 0)
+ break;
}
if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ mutex_unlock(&syslog_lock);
}
- logbuf_unlock_irq();
kfree(text);
return len;
@@ -1522,14 +1697,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- logbuf_unlock_irq();
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ mutex_unlock(&syslog_lock);
}
int do_syslog(int type, char __user *buf, int len, int source)
{
+ struct printk_info info;
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
@@ -1550,16 +1725,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
- error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
- if (error)
- return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
- /* FALL THRU */
+ fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
if (!buf || len < 0)
@@ -1599,11 +1770,15 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
+ mutex_lock(&syslog_lock);
+ if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
+ /* No unread messages. */
+ mutex_unlock(&syslog_lock);
+ return 0;
+ }
+ if (info.seq != syslog_seq) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
+ syslog_seq = info.seq;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1612,24 +1787,21 @@ int do_syslog(int type, char __user *buf, int len, int source)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = prb_next_seq(prb) - syslog_seq;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
bool time = syslog_partial ? syslog_time : printk_time;
+ unsigned int line_count;
+ u64 seq;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
+ prb_for_each_info(syslog_seq, prb, seq, &info,
+ &line_count) {
+ error += get_record_print_text_size(&info, line_count,
+ true, time);
time = printk_time;
- idx = log_next(idx);
- seq++;
}
error -= syslog_partial;
}
- logbuf_unlock_irq();
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1685,19 +1857,20 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(void)
+static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -1717,6 +1890,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
@@ -1744,6 +1923,16 @@ static int console_trylock_spinning(void)
if (console_trylock())
return 1;
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
+ return 0;
+
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
@@ -1789,38 +1978,81 @@ static int console_trylock_spinning(void)
}
/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until per-CPU
+ * is available, a separate "early tracking" is performed.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
-{
- struct console *con;
+static DEFINE_PER_CPU(u8, printk_count);
+static u8 printk_count_early;
+#ifdef CONFIG_HAVE_NMI
+static DEFINE_PER_CPU(u8, printk_count_nmi);
+static u8 printk_count_nmi_early;
+#endif
- trace_console_rcuidle(text, len);
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else
- con->write(con, text, len);
+/*
+ * Return a pointer to the dedicated counter for the CPU+context of the
+ * caller.
+ */
+static u8 *__printk_recursion_counter(void)
+{
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi()) {
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count_nmi);
+ return &printk_count_nmi_early;
}
+#endif
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count);
+ return &printk_count_early;
}
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the boolean return value to see if the recursion is
+ * allowed. On failure, interrupts are not disabled.
+ *
+ * @recursion_ptr must be a variable of type (u8 *) and is the same variable
+ * that is passed to printk_exit_irqrestore().
+ */
+#define printk_enter_irqsave(recursion_ptr, flags) \
+({ \
+ bool success = true; \
+ \
+ typecheck(u8 *, recursion_ptr); \
+ local_irq_save(flags); \
+ (recursion_ptr) = __printk_recursion_counter(); \
+ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
+ local_irq_restore(flags); \
+ success = false; \
+ } else { \
+ (*(recursion_ptr))++; \
+ } \
+ success; \
+})
+
+/* Exit recursion tracking, restoring interrupts. */
+#define printk_exit_irqrestore(recursion_ptr, flags) \
+ do { \
+ typecheck(u8 *, recursion_ptr); \
+ (*(recursion_ptr))--; \
+ local_irq_restore(flags); \
+ } while (0)
+
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -1834,277 +2066,287 @@ static inline void printk_delay(void)
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + raw_smp_processor_id();
+ 0x80000000 + smp_processor_id();
}
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
- char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- u32 caller_id; /* printk_caller_id() of first print */
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log facility of first message */
- enum log_flags flags; /* prefix, newline flags */
-} cont;
-
-static void cont_flush(void)
-{
- if (cont.len == 0)
- return;
-
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
-}
-
-static bool cont_add(u32 caller_id, int facility, int level,
- enum log_flags flags, const char *text, size_t len)
+/**
+ * printk_parse_prefix - Parse level and control flags.
+ *
+ * @text: The terminated text message.
+ * @level: A pointer to the current level value, will be updated.
+ * @flags: A pointer to the current printk_info flags, will be updated.
+ *
+ * @level may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @level must be set to
+ * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
+ *
+ * @flags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @flags will be OR'd with the parsed
+ * value.
+ *
+ * Return: The length of the parsed level and control flags.
+ */
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags)
{
- /* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
+ u16 prefix_len = 0;
+ int kern_level;
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
- }
+ while (*text) {
+ kern_level = printk_get_level(text);
+ if (!kern_level)
+ break;
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level && *level == LOGLEVEL_DEFAULT)
+ *level = kern_level - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ if (flags)
+ *flags |= LOG_CONT;
+ }
- // The original flags come from the first line,
- // but later continuations can add a newline.
- if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
+ prefix_len += 2;
+ text += 2;
}
- return true;
+ return prefix_len;
}
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
+__printf(5, 0)
+static u16 printk_sprint(char *text, u16 size, int facility,
+ enum printk_info_flags *flags, const char *fmt,
+ va_list args)
{
- const u32 caller_id = printk_caller_id();
+ u16 text_len;
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
+ text_len = vscnprintf(text, size, fmt, args);
+
+ /* Mark and strip a trailing newline. */
+ if (text_len && text[text_len - 1] == '\n') {
+ text_len--;
+ *flags |= LOG_NEWLINE;
}
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
+ /* Strip log level and control flags. */
+ if (facility == 0) {
+ u16 prefix_len;
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
+ prefix_len = printk_parse_prefix(text, NULL, NULL);
+ if (prefix_len) {
+ text_len -= prefix_len;
+ memmove(text, text + prefix_len, text_len);
+ }
}
- /* Store it in the record log */
- return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ trace_console(text, text_len);
+
+ return text_len;
}
-/* Must be called under logbuf_lock. */
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len;
- enum log_flags lflags = 0;
+ struct prb_reserved_entry e;
+ enum printk_info_flags flags = 0;
+ struct printk_record r;
+ unsigned long irqflags;
+ u16 trunc_msg_len = 0;
+ char prefix_buf[8];
+ u8 *recursion_ptr;
+ u16 reserve_size;
+ va_list args2;
+ u32 caller_id;
+ u16 text_len;
+ int ret = 0;
+ u64 ts_nsec;
+
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * Since the duration of printk() can vary depending on the message
+ * and state of the ringbuffer, grab the timestamp now so that it is
+ * close to the call of printk(). This provides a more deterministic
+ * timestamp with respect to the caller.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ ts_nsec = local_clock();
- /* mark and strip a trailing newline */
- if (text_len && text[text_len-1] == '\n') {
- text_len--;
- lflags |= LOG_NEWLINE;
- }
+ caller_id = printk_caller_id();
- /* strip kernel syslog prefix and extract log level or control flags */
- if (facility == 0) {
- int kern_level;
+ /*
+ * The sprintf needs to come first since the syslog prefix might be
+ * passed in as a parameter. An extra byte must be reserved so that
+ * later the vscnprintf() into the reserved buffer has room for the
+ * terminating '\0', which is not counted by vsnprintf().
+ */
+ va_copy(args2, args);
+ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
+ va_end(args2);
- while ((kern_level = printk_get_level(text)) != 0) {
- switch (kern_level) {
- case '0' ... '7':
- if (level == LOGLEVEL_DEFAULT)
- level = kern_level - '0';
- break;
- case 'c': /* KERN_CONT */
- lflags |= LOG_CONT;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
+
+ /* Extract log level or control flags. */
+ if (facility == 0)
+ printk_parse_prefix(&prefix_buf[0], &level, &flags);
+
+ if (level == LOGLEVEL_DEFAULT)
+ level = default_message_loglevel;
+
+ if (dev_info)
+ flags |= LOG_NEWLINE;
+
+ if (flags & LOG_CONT) {
+ prb_rec_init_wr(&r, reserve_size);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
+ facility, &flags, fmt, args);
+ r.info->text_len += text_len;
+
+ if (flags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
}
- text_len -= 2;
- text += 2;
+ ret = text_len;
+ goto out;
}
}
- if (level == LOGLEVEL_DEFAULT)
- level = default_message_loglevel;
+ /*
+ * Explicitly initialize the record before every prb_reserve() call.
+ * prb_reserve_in_last() and prb_reserve() purposely invalidate the
+ * structure when they fail.
+ */
+ prb_rec_init_wr(&r, reserve_size);
+ if (!prb_reserve(&e, prb, &r)) {
+ /* truncate the message if it is too long for empty buffer */
+ truncate_msg(&reserve_size, &trunc_msg_len);
+
+ prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
+ if (!prb_reserve(&e, prb, &r))
+ goto out;
+ }
- if (dict)
- lflags |= LOG_NEWLINE;
+ /* fill message */
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = flags & 0x1f;
+ r.info->ts_nsec = ts_nsec;
+ r.info->caller_id = caller_id;
+ if (dev_info)
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+
+ /* A message without a trailing newline can be continued. */
+ if (!(flags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(recursion_ptr, irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
int printed_len;
- bool in_sched = false, pending_output;
- unsigned long flags;
- u64 curr_log_seq;
+ bool in_sched = false;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
+ if (unlikely(suppress_panic_printk) &&
+ atomic_read(&panic_cpu) != raw_smp_processor_id())
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
}
- boot_delay_msec(level);
- printk_delay();
+ printk_delay(level);
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
- logbuf_unlock_irqrestore(flags);
+ printed_len = vprintk_store(facility, level, dev_info, fmt, args);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
+ if (!in_sched) {
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
+ * semaphore. The release will print out buffers. With the
+ * spinning variant, this context tries to take over the
+ * printing from another printing context.
*/
if (console_trylock_spinning())
console_unlock();
preempt_enable();
}
- if (pending_output)
+ if (in_sched)
+ defer_console_output();
+ else
wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_func(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
int vprintk_default(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the
- * output and call the console drivers. If we fail to get the semaphore, we
- * place the output into the log buffer and return. The current holder of
- * the console_sem will notice the new output in console_unlock(); and will
- * send it to the consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-asmlinkage __visible int printk(const char *fmt, ...)
+asmlinkage __visible int _printk(const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
- r = vprintk_func(fmt, args);
+ r = vprintk(fmt, args);
va_end(args);
return r;
}
-EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(_printk);
+
+static bool pr_flush(int timeout_ms, bool reset_on_progress);
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
#define printk_time false
+#define prb_read_valid(rb, seq, r) false
+#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
+
static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
+
+static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2128,13 +2370,35 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
-static int __add_preferred_console(char *name, int idx, char *options,
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear when added twice also by SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
+static int __add_preferred_console(const char *name, const short idx, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
/*
+ * We use a signed short index for struct console for device drivers to
+ * indicate a not yet assigned index or port. However, a negative index
+ * value is not valid for preferred console.
+ */
+ if (idx < 0)
+ return -EINVAL;
+
+ /*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
@@ -2144,8 +2408,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
preferred_console = i;
- if (user_specified)
- c->user_specified = true;
+ set_user_specified(c, user_specified);
return 0;
}
}
@@ -2153,9 +2416,9 @@ static int __add_preferred_console(char *name, int idx, char *options,
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ strscpy(c->name, name, sizeof(c->name));
c->options = options;
- c->user_specified = user_specified;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
@@ -2182,8 +2445,15 @@ static int __init console_setup(char *str)
char *s, *options, *brl_options = NULL;
int idx;
- if (str[0] == 0)
+ /*
+ * console="" or console=null have been suggested as a way to
+ * disable console output. Use ttynull that has been created
+ * for exactly this purpose.
+ */
+ if (str[0] == 0 || strcmp(str, "null") == 0) {
+ __add_preferred_console("ttynull", 0, NULL, NULL, true);
return 1;
+ }
if (_braille_console_setup(&str, &brl_options))
return 1;
@@ -2214,7 +2484,6 @@ static int __init console_setup(char *str)
*s = 0;
__add_preferred_console(buf, idx, options, brl_options, true);
- console_set_on_cmdline = 1;
return 1;
}
__setup("console=", console_setup);
@@ -2232,7 +2501,7 @@ __setup("console=", console_setup);
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
{
return __add_preferred_console(name, idx, options, NULL, false);
}
@@ -2251,6 +2520,18 @@ module_param_named(console_suspend, console_suspend_enabled,
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
+static bool printk_console_no_auto_verbose;
+
+void console_verbose(void)
+{
+ if (console_loglevel && !printk_console_no_auto_verbose)
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+}
+EXPORT_SYMBOL_GPL(console_verbose);
+
+module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
+MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
+
/**
* suspend_console - suspend the console subsystem
*
@@ -2258,21 +2539,47 @@ MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
- console_lock();
- console_suspended = 1;
- up_console_sem();
+ pr_flush(1000, true);
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
void resume_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ pr_flush(1000, true);
}
/**
@@ -2294,11 +2601,31 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ if (!panic_in_progress())
+ return false;
+
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2306,30 +2633,31 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (other_cpu_in_panic())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (other_cpu_in_panic())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2343,186 +2671,388 @@ int is_console_locked(void)
EXPORT_SYMBOL(is_console_locked);
/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires the console_srcu_read_lock.
*/
-static int have_callable_console(void)
+static inline bool console_is_usable(struct console *con)
{
- struct console *con;
+ short flags = console_srcu_read_flags(con);
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
+ if (!(flags & CON_ENABLED))
+ return false;
- return 0;
+ if ((flags & CON_SUSPENDED))
+ return false;
+
+ if (!con->write)
+ return false;
+
+ /*
+ * Console drivers may assume that per-cpu resources have been
+ * allocated. So unless they're explicitly marked as being able to
+ * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+ return false;
+
+ return true;
}
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(void)
+static void __console_unlock(void)
{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
+ console_locked = 0;
+ up_console_sem();
}
-/**
- * console_unlock - unlock the console system
+#ifdef CONFIG_PRINTK
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
+ * is achieved by shifting the existing message over and inserting the dropped
+ * message.
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * @pmsg is the printk message to prepend.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
+ * @dropped is the dropped count to report in the dropped message.
*
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ * If the message text in @pmsg->pbufs->outbuf does not have enough space for
+ * the dropped message, the message text will be sufficiently truncated.
*
- * console_unlock(); may be called from any context.
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-void console_unlock(void)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ size_t len;
+
+ len = scnprintf(scratchbuf, scratchbuf_sz,
+ "** %lu printk messages dropped **\n", dropped);
- if (console_suspended) {
- up_console_sem();
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
}
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
+}
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
+ *
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
+ *
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
+ *
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
+ *
+ * @may_supress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
+ */
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
+{
+ static int panic_console_dropped;
+
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
+
/*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
*
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
*/
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
/*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
+ * Check for dropped messages in panic here so that printk
+ * suppression can occur as early as possible if necessary.
*/
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
+ if (pmsg->dropped &&
+ panic_in_progress() &&
+ panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
}
- for (;;) {
- struct printk_log *msg;
- size_t ext_len = 0;
- size_t len;
+ /* Skip record that has level above the console loglevel. */
+ if (may_suppress && suppress_message_printing(r.info->level))
+ goto out;
- printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = snprintf(text, sizeof(text),
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
+out:
+ pmsg->outbuf_len = len;
+ return true;
+}
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
+/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its usage requires the console_lock held.
+ */
+struct printk_buffers printk_shared_pbufs;
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
+ *
+ * @cookie is the cookie from the SRCU read lock.
+ *
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
+ *
+ * Requires the console_lock and the SRCU read lock.
+ */
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &printk_shared_pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &printk_shared_pbufs,
+ };
+ unsigned long flags;
+
+ *handover = false;
+
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
+ return false;
+
+ con->dropped += pmsg.dropped;
+
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
+ goto skip;
+ }
+
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
+ }
+
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ /* Write everything out to the hardware. */
+ con->write(con, outbuf, pmsg.outbuf_len);
+
+ start_critical_timings();
+
+ con->seq = pmsg.seq + 1;
+
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
skip:
- if (console_seq == log_next_seq)
- break;
+ return true;
+}
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_idx = log_next(console_idx);
- console_seq++;
- goto skip;
- }
+#else
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ *handover = false;
+ return false;
+}
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
- if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
- sizeof(ext_text),
- msg, console_seq);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
- }
- console_idx = log_next(console_idx);
- console_seq++;
- raw_spin_unlock(&logbuf_lock);
+#endif /* CONFIG_PRINTK */
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ bool any_usable = false;
+ struct console *con;
+ bool any_progress;
+ int cookie;
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
+ *next_seq = 0;
+ *handover = false;
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
+ do {
+ any_progress = false;
- printk_safe_exit_irqrestore(flags);
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ bool progress;
- if (do_cond_resched)
- cond_resched();
- }
+ if (!console_is_usable(con))
+ continue;
+ any_usable = true;
- console_locked = 0;
+ progress = console_emit_next_record(con, handover, cookie);
- raw_spin_unlock(&logbuf_lock);
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
+ if (*handover)
+ return false;
- up_console_sem();
+ /* Track the next of the highest seq flushed. */
+ if (con->seq > *next_seq)
+ *next_seq = con->seq;
+
+ if (!progress)
+ continue;
+ any_progress = true;
+
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (other_cpu_in_panic())
+ goto abandon;
+
+ if (do_cond_resched)
+ cond_resched();
+ }
+ console_srcu_read_unlock(cookie);
+ } while (any_progress);
+
+ return any_usable;
+
+abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
+}
+
+/**
+ * console_unlock - unblock the console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ bool do_cond_resched;
+ bool handover;
+ bool flushed;
+ u64 next_seq;
/*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
+ * Console drivers are called with interrupts disabled, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system. Therefore, create
+ * a local to use for the printing loop.
*/
- raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
- raw_spin_unlock(&logbuf_lock);
- printk_safe_exit_irqrestore(flags);
+ do_cond_resched = console_may_schedule;
+
+ do {
+ console_may_schedule = 0;
+
+ flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+ if (!handover)
+ __console_unlock();
- if (retry && console_trylock())
- goto again;
+ /*
+ * Abort if there was a failure to flush all messages to all
+ * usable consoles. Either it is not possible to flush (in
+ * which case it would be an infinite loop of retrying) or
+ * another context has taken over printing.
+ */
+ if (!flushed)
+ break;
+
+ /*
+ * Some context may have added new records after
+ * console_flush_all() but before unlocking the console.
+ * Re-check if there is a new record to flush. If the trylock
+ * fails, another context is already handling the printing.
+ */
+ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}
EXPORT_SYMBOL(console_unlock);
@@ -2544,13 +3074,45 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
+ int cookie;
+
+ /*
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -2558,10 +3120,18 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
+
+ if (!oops_in_progress)
+ pr_flush(1000, true);
}
/**
@@ -2572,25 +3142,52 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ bool handover;
+ u64 next_seq;
+
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
+ /*
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
*/
- console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- logbuf_unlock_irqrestore(flags);
+ struct console *c;
+ short flags;
+ int cookie;
+ u64 seq;
+
+ seq = prb_first_valid_seq(prb);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ flags = console_srcu_read_flags(c);
+
+ if (flags & CON_NBCON) {
+ nbcon_seq_force(c, seq);
+ } else {
+ /*
+ * This is an unsynchronized assignment. On
+ * panic legacy consoles are only best effort.
+ */
+ c->seq = seq;
+ }
+ }
+ console_srcu_read_unlock(cookie);
}
- console_unlock();
+
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -2600,15 +3197,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
@@ -2620,17 +3227,27 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ __pr_flush(console, 1000, true);
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ console_list_unlock();
+ __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -2655,10 +3272,11 @@ early_param("keep_bootcon", keep_bootcon_setup);
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
-static int try_enable_new_console(struct console *newcon, bool user_specified)
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
{
struct console_cmdline *c;
- int i;
+ int i, err;
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
@@ -2681,21 +3299,19 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return 0;
if (newcon->setup &&
- newcon->setup(newcon, c->options) != 0)
- return -EIO;
+ (err = newcon->setup(newcon, c->options)) != 0)
+ return err;
}
newcon->flags |= CON_ENABLED;
- if (i == preferred_console) {
+ if (i == preferred_console)
newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
return 0;
}
/*
* Some consoles, such as pstore and netconsole, can be enabled even
* without matching. Accept the pre-enabled consoles only when match()
- * and setup() had a change to be called.
+ * and setup() had a chance to be called.
*/
if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
return 0;
@@ -2703,6 +3319,87 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return -ENOENT;
}
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
+static void console_init_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */
+ mutex_lock(&syslog_lock);
+ newcon->seq = syslog_seq;
+ mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message added to ringbuffer. */
+ newcon->seq = prb_next_seq(prb);
+
+ /*
+ * If any enabled boot consoles are due to be unregistered
+ * shortly, some may not be caught up and may be the same
+ * device as @newcon. Since it is not known which boot console
+ * is the same device, flush all consoles and, if necessary,
+ * start with the message of the enabled boot console that is
+ * the furthest behind.
+ */
+ if (bootcon_registered && !keep_bootcon) {
+ /*
+ * Hold the console_lock to stop console printing and
+ * guarantee safe access to console->seq.
+ */
+ console_lock();
+
+ /*
+ * Flush all consoles and set the console to start at
+ * the next unprinted sequence number.
+ */
+ if (!console_flush_all(true, &newcon->seq, &handover)) {
+ /*
+ * Flushing failed. Just choose the lowest
+ * sequence of the enabled boot consoles.
+ */
+
+ /*
+ * If there was a handover, this context no
+ * longer holds the console_lock.
+ */
+ if (handover)
+ console_lock();
+
+ newcon->seq = prb_next_seq(prb);
+ for_each_console(con) {
+ if ((con->flags & CON_BOOT) &&
+ (con->flags & CON_ENABLED) &&
+ con->seq < newcon->seq) {
+ newcon->seq = con->seq;
+ }
+ }
+ }
+
+ console_unlock();
+ }
+ }
+}
+
+#define console_first() \
+ hlist_entry(console_list.first, struct console, node)
+
+static int unregister_console_locked(struct console *console);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2724,64 +3421,73 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- unsigned long flags;
- struct console *bcon = NULL;
+ struct console *con;
+ bool bootcon_registered = false;
+ bool realcon_registered = false;
int err;
- for_each_console(bcon) {
- if (WARN(bcon == newcon, "console '%s%d' already registered\n",
- bcon->name, bcon->index))
- return;
- }
+ console_list_lock();
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (newcon->flags & CON_BOOT) {
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index)) {
+ goto unlock;
}
+
+ if (con->flags & CON_BOOT)
+ bootcon_registered = true;
+ else
+ realcon_registered = true;
}
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
+ /* Do not register boot consoles when there already is a real one. */
+ if ((newcon->flags & CON_BOOT) && realcon_registered) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ goto unlock;
+ }
- if (!has_preferred_console || bcon || !console_drivers)
- has_preferred_console = preferred_console >= 0;
+ if (newcon->flags & CON_NBCON) {
+ /*
+ * Ensure the nbcon console buffers can be allocated
+ * before modifying any global data.
+ */
+ if (!nbcon_alloc(newcon))
+ goto unlock;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+ * Nope when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
- if (!has_preferred_console) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
+ if (preferred_console < 0) {
+ if (hlist_empty(&console_list) || !console_first()->device ||
+ console_first()->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
/* See if this console matches one we selected on the command line */
- err = try_enable_new_console(newcon, true);
+ err = try_enable_preferred_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
- err = try_enable_new_console(newcon, false);
+ err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
- if (err || newcon->flags & CON_BRL)
- return;
+ if (err || newcon->flags & CON_BRL) {
+ if (newcon->flags & CON_NBCON)
+ nbcon_free(newcon);
+ goto unlock;
+ }
/*
* If we have a bootconsole, and are switching to a real console,
@@ -2789,51 +3495,41 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_registered &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
+
+ newcon->dropped = 0;
+ console_init_seq(newcon, bootcon_registered);
+
+ if (newcon->flags & CON_NBCON)
+ nbcon_init(newcon);
/*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
*/
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
- /* Ensure this flag is always set for the head of the list */
+ if (hlist_empty(&console_list)) {
+ /* Ensure CON_CONSDEV is always set for the head. */
newcon->flags |= CON_CONSDEV;
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
+ } else if (newcon->flags & CON_CONSDEV) {
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
} else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
+ hlist_add_behind_rcu(&newcon->node, console_list.first);
}
- if (newcon->flags & CON_EXTENDED)
- nr_ext_console_drivers++;
+ /*
+ * No need to synchronize SRCU here! The caller does not rely
+ * on all contexts being able to see the new console before
+ * register_console() completes.
+ */
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- logbuf_lock_irqsave(flags);
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- logbuf_unlock_irqrestore(flags);
- }
- console_unlock();
console_sysfs_notify();
/*
@@ -2843,30 +3539,30 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
- if (bcon &&
+ con_printk(KERN_INFO, newcon, "enabled\n");
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
- */
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_BOOT)
+ unregister_console_locked(con);
+ }
}
+unlock:
+ console_list_unlock();
}
EXPORT_SYMBOL(register_console);
-int unregister_console(struct console *console)
+/* Must be called under console_list_lock(). */
+static int unregister_console_locked(struct console *console)
{
- struct console *con;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
+ lockdep_assert_console_list_lock_held();
+
+ con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
if (res < 0)
@@ -2874,51 +3570,97 @@ int unregister_console(struct console *console)
if (res > 0)
return 0;
- res = -ENODEV;
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else {
- for_each_console(con) {
- if (con->next == console) {
- con->next = console->next;
- res = 0;
- break;
- }
- }
- }
+ /* Disable it unconditionally */
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
- if (res)
- goto out_disable_unlock;
+ if (!console_is_registered_locked(console))
+ return -ENODEV;
- if (console->flags & CON_EXTENDED)
- nr_ext_console_drivers--;
+ hlist_del_init_rcu(&console->node);
/*
+ * <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
+ * </HISTORICAL>
+ *
+ * The above makes no sense as there is no guarantee that the next
+ * console has any device attached. Oh well....
*/
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
+ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
+ console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts
+ * must not be able to see this console in the list so that any
+ * exit/cleanup routines can be performed safely.
+ */
+ synchronize_srcu(&console_srcu);
+
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
- console->flags &= ~CON_ENABLED;
- console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
return res;
+}
-out_disable_unlock:
- console->flags &= ~CON_ENABLED;
- console_unlock();
+int unregister_console(struct console *console)
+{
+ int res;
+ console_list_lock();
+ res = unregister_console_locked(console);
+ console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
+/**
+ * console_force_preferred_locked - force a registered console preferred
+ * @con: The registered console to force preferred.
+ *
+ * Must be called under console_list_lock().
+ */
+void console_force_preferred_locked(struct console *con)
+{
+ struct console *cur_pref_con;
+
+ if (!console_is_registered_locked(con))
+ return;
+
+ cur_pref_con = console_first();
+
+ /* Already preferred? */
+ if (cur_pref_con == con)
+ return;
+
+ /*
+ * Delete, but do not re-initialize the entry. This allows the console
+ * to continue to appear registered (via any hlist_unhashed_lockless()
+ * checks), even though it was briefly removed from the console list.
+ */
+ hlist_del_rcu(&con->node);
+
+ /*
+ * Ensure that all SRCU list walks have completed so that the console
+ * can be added to the beginning of the console list and its forward
+ * list pointer can be re-initialized.
+ */
+ synchronize_srcu(&console_srcu);
+
+ con->flags |= CON_CONSDEV;
+ WARN_ON(!con->device);
+
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&con->node, &console_list);
+}
+EXPORT_SYMBOL(console_force_preferred_locked);
+
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
@@ -2961,14 +3703,16 @@ void __init console_init(void)
*
* To mitigate this problem somewhat, only unregister consoles whose memory
* intersects with the init section. Note that all other boot consoles will
- * get unregistred when the real preferred console is registered.
+ * get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
+ struct hlist_node *tmp;
struct console *con;
int ret;
- for_each_console(con) {
+ console_list_lock();
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (!(con->flags & CON_BOOT))
continue;
@@ -2985,20 +3729,126 @@ static int __init printk_late_init(void)
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
- unregister_console(con);
+ unregister_console_locked(con);
}
}
+ console_list_unlock();
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
console_cpu_notify, NULL);
WARN_ON(ret < 0);
+ printk_sysctl_init();
return 0;
}
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+ unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+ unsigned long remaining_jiffies = timeout_jiffies;
+ struct console *c;
+ u64 last_diff = 0;
+ u64 printk_seq;
+ short flags;
+ int cookie;
+ u64 diff;
+ u64 seq;
+
+ might_sleep();
+
+ seq = prb_next_seq(prb);
+
+ /* Flush the consoles so that records up to @seq are printed. */
+ console_lock();
+ console_unlock();
+
+ for (;;) {
+ unsigned long begin_jiffies;
+ unsigned long slept_jiffies;
+
+ diff = 0;
+
+ /*
+ * Hold the console_lock to guarantee safe access to
+ * console->seq. Releasing console_lock flushes more
+ * records in case @seq is still not printed on all
+ * usable consoles.
+ */
+ console_lock();
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (con && con != c)
+ continue;
+
+ flags = console_srcu_read_flags(c);
+
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
+ if (!console_is_usable(c))
+ continue;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(c);
+ } else {
+ printk_seq = c->seq;
+ }
+
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+ console_srcu_read_unlock(cookie);
+
+ if (diff != last_diff && reset_on_progress)
+ remaining_jiffies = timeout_jiffies;
+
+ console_unlock();
+
+ /* Note: @diff is 0 if there are no usable consoles. */
+ if (diff == 0 || remaining_jiffies == 0)
+ break;
+
+ /* msleep(1) might sleep much longer. Check time by jiffies. */
+ begin_jiffies = jiffies;
+ msleep(1);
+ slept_jiffies = jiffies - begin_jiffies;
+
+ remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all usable printers are caught up.
+ */
+static bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+
/*
* Delayed printk version, for scheduler-internal messages:
*/
@@ -3009,7 +3859,7 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
@@ -3021,46 +3871,81 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
wake_up_interruptible(&log_wait);
}
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
-};
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
+ IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the wait queue is empty.
+ *
+ * The full memory barrier within wq_has_sleeper() pairs with the full
+ * memory barrier within set_current_state() of
+ * prepare_to_wait_event(), which is called after ___wait_event() adds
+ * the waiter but before it has checked the wait condition.
+ *
+ * This pairs with devkmsg_read:A and syslog_print:A.
+ */
+ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+ (val & PRINTK_PENDING_OUTPUT)) {
+ this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
-void defer_console_output(void)
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
+void wake_up_klogd(void)
{
- if (!printk_percpu_data_ready())
- return;
-
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
-int vprintk_deferred(const char *fmt, va_list args)
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
+void defer_console_output(void)
{
- int r;
+ /*
+ * New messages may have been added directly to the ringbuffer
+ * using vprintk_store(), so wake any waiters as well.
+ */
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+}
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+void printk_trigger_flush(void)
+{
defer_console_output();
+}
- return r;
+int vprintk_deferred(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
-int printk_deferred(const char *fmt, ...)
+int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
@@ -3197,7 +4082,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3214,28 +4098,15 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (reason > max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
-
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
- logbuf_unlock_irqrestore(flags);
-
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
}
rcu_read_unlock();
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3249,82 +4120,56 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
- struct printk_log *msg;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
size_t l = 0;
bool ret = false;
- if (!dumper->active)
- goto out;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
+ prb_rec_init_rd(&r, &info, line, size);
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
- goto out;
+ /* Read text or count text lines? */
+ if (line) {
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
+ goto out;
+ l = record_print_text(&r, syslog, printk_time);
+ } else {
+ if (!prb_read_valid_info(prb, iter->cur_seq,
+ &info, &line_count)) {
+ goto out;
+ }
+ l = get_record_print_text_size(&info, line_count, syslog,
+ printk_time);
- msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog, printk_time, line, size);
+ }
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
- * with as many of the the *youngest* kmsg records that fit into it.
+ * with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
@@ -3334,114 +4179,201 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
+ char *buf, size_t size, size_t *len_out)
{
- unsigned long flags;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ struct printk_record r;
u64 seq;
- u32 idx;
u64 next_seq;
- u32 next_idx;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
bool time = printk_time;
- if (!dumper->active)
+ if (!buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
+
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
+ /* messages are gone, move to first available one */
+ iter->cur_seq = info.seq;
+ }
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
+ */
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
+ size - 1, syslog, time);
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump block of
+ * older records stored right before this one.
+ */
+ next_seq = seq;
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_rec_init_rd(&r, &info, buf, size);
- /* last message in next interation */
- next_seq = seq;
- next_idx = idx;
+ prb_for_each_record(seq, prb, seq, &r) {
+ if (r.info->seq >= iter->next_seq)
+ break;
- l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ len += record_print_text(&r, syslog, time);
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
+ iter->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_rewind - reset the iterator
+ * @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
+{
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+#endif
+
+#ifdef CONFIG_SMP
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
+
+/**
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
*
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ * Context: Any context.
*/
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+void __printk_cpu_sync_wait(void)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ do {
+ cpu_relax();
+ } while (atomic_read(&printk_cpu_sync_owner) != -1);
}
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
/**
- * kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
*
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ * Return: 1 on success, otherwise 0.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+int __printk_cpu_sync_try_get(void)
{
- unsigned long flags;
+ int cpu;
+ int old;
+
+ cpu = smp_processor_id();
- logbuf_lock_irqsave(flags);
- kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
+ /*
+ * Guarantee loads and stores from this CPU when it is the lock owner
+ * are _not_ visible to the previous lock owner. This pairs with
+ * __printk_cpu_sync_put:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_put:A can never read from
+ * __printk_cpu_sync_try_get:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of the previous CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of this CPU
+ */
+ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+ cpu); /* LMM(__printk_cpu_sync_try_get:A) */
+ if (old == -1) {
+ /*
+ * This CPU is now the owner and begins loading/storing
+ * data: LMM(__printk_cpu_sync_try_get:B)
+ */
+ return 1;
+
+ } else if (old == cpu) {
+ /* This CPU is already the owner. */
+ atomic_inc(&printk_cpu_sync_nested);
+ return 1;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
-#endif
+/**
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
+ *
+ * The calling processor must be the owner of the lock.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ */
+void __printk_cpu_sync_put(void)
+{
+ if (atomic_read(&printk_cpu_sync_nested)) {
+ atomic_dec(&printk_cpu_sync_nested);
+ return;
+ }
+
+ /*
+ * This CPU is finished loading/storing data:
+ * LMM(__printk_cpu_sync_put:A)
+ */
+
+ /*
+ * Guarantee loads and stores from this CPU when it was the
+ * lock owner are visible to the next lock owner. This pairs
+ * with __printk_cpu_sync_try_get:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of this CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of the next CPU
+ */
+ atomic_set_release(&printk_cpu_sync_owner,
+ -1); /* LMM(__printk_cpu_sync_put:B) */
+}
+EXPORT_SYMBOL(__printk_cpu_sync_put);
+#endif /* CONFIG_SMP */
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 000000000000..fde338606ce8
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2124 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 3 internal ringbuffers:
+ *
+ * desc_ring
+ * A ring of descriptors and their meta data (such as sequence number,
+ * timestamp, loglevel, etc.) as well as internal state information about
+ * the record and logical positions specifying where in the other
+ * ringbuffer the text strings are located.
+ *
+ * text_data_ring
+ * A ring of data blocks. A data block consists of an unsigned long
+ * integer (ID) that maps to a desc_ring index followed by the text
+ * string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ * reserved
+ * A writer is modifying the record.
+ *
+ * committed
+ * The record and all its data are written. A writer can reopen the
+ * descriptor (transitioning it back to reserved), but in the committed
+ * state the data is consistent.
+ *
+ * finalized
+ * The record and all its data are complete and available for reading. A
+ * writer cannot reopen the descriptor.
+ *
+ * reusable
+ * The record exists, but its text and/or meta data may no longer be
+ * available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ * miss
+ * The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data blocks
+ * associated with the tail descriptor (for the text ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * committed or reusable queried state. This makes it possible that a valid
+ * sequence number of the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ * 1) A writer can simultaneously commit and finalize its record by calling
+ * prb_final_commit() instead of prb_commit().
+ *
+ * 2) When a new record is reserved and the previous record has been
+ * committed via prb_commit(), that previous record is automatically
+ * finalized.
+ *
+ * 3) When a record is committed via prb_commit() and a newer record
+ * already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ * 1) The descriptor associated with the data block is in the committed
+ * or finalized queried state.
+ *
+ * 2) The blk_lpos struct within the descriptor associated with the data
+ * block references back to the same data block.
+ *
+ * 3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records are stored in printk_info structs,
+ * stored in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descriptor of the same index in the
+ * descriptor ring. Info validity is confirmed by evaluating the corresponding
+ * descriptor before and after loading the info.
+ *
+ * Usage
+ * -----
+ * Here are some simple examples demonstrating writers and readers. For the
+ * examples a global ringbuffer (test_rb) is available (which is not the
+ * actual ringbuffer used by printk)::
+ *
+ * DEFINE_PRINTKRB(test_rb, 15, 5);
+ *
+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
+ * 1 MiB (2 ^ (15 + 5)) for text data.
+ *
+ * Sample writer code::
+ *
+ * const char *textstr = "message text";
+ * struct prb_reserved_entry e;
+ * struct printk_record r;
+ *
+ * // specify how much to allocate
+ * prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ * if (prb_reserve(&e, &test_rb, &r)) {
+ * snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ * // alternate rest of previous example
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit the record (but do not finalize yet)
+ * prb_commit(&e);
+ * }
+ *
+ * ...
+ *
+ * // specify additional 5 bytes text space to extend
+ * prb_rec_init_wr(&r, 5);
+ *
+ * // try to extend, but only if it does not exceed 32 bytes
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
+ * snprintf(&r.text_buf[r.info->text_len],
+ * r.text_buf_size - r.info->text_len, "hello");
+ *
+ * r.info->text_len += 5;
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Sample reader code::
+ *
+ * struct printk_info info;
+ * struct printk_record r;
+ * char text_buf[32];
+ * u64 seq;
+ *
+ * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ * prb_for_each_record(0, &test_rb, &seq, &r) {
+ * if (info.seq != seq)
+ * pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ * if (info.text_len > r.text_buf_size) {
+ * pr_warn("record %llu text truncated\n", info.seq);
+ * text_buf[r.text_buf_size - 1] = 0;
+ * }
+ *
+ * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ * &text_buf[0]);
+ * }
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ * LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
+ *
+ * The memory barrier pairs and their ordering are:
+ *
+ * desc_reserve:D / desc_reserve:B
+ * push descriptor tail (id), then push descriptor head (id)
+ *
+ * desc_reserve:D / data_push_tail:B
+ * push data tail (lpos), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / desc_push_tail:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / prb_first_seq:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:F / desc_read:D
+ * set new descriptor id and reserved (state), then allow writer changes
+ *
+ * data_alloc:A (or data_realloc:A) / desc_read:D
+ * set old descriptor reusable (state), then modify new data block area
+ *
+ * data_alloc:A (or data_realloc:A) / data_push_tail:B
+ * push data tail (lpos), then modify new data block area
+ *
+ * _prb_commit:B / desc_read:B
+ * store writer changes, then set new descriptor committed (state)
+ *
+ * desc_reopen_last:A / _prb_commit:B
+ * set descriptor reserved (state), then read descriptor data
+ *
+ * _prb_commit:B / desc_reserve:D
+ * set new descriptor committed (state), then check descriptor head (id)
+ *
+ * data_push_tail:D / data_push_tail:A
+ * set descriptor reusable (state), then push data tail (lpos)
+ *
+ * desc_push_tail:B / desc_reserve:D
+ * set descriptor reusable (state), then push descriptor tail (id)
+ */
+
+#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
+#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1)
+
+#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits)
+#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1)
+
+/* Determine the data array index from a logical position. */
+#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring))
+
+/* Determine the desc array index from an ID or sequence number. */
+#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring))
+
+/* Determine how many times the data array has wrapped. */
+#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits)
+
+/* Determine if a logical position refers to a data-less block. */
+#define LPOS_DATALESS(lpos) ((lpos) & 1UL)
+#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \
+ LPOS_DATALESS((blk)->next))
+
+/* Get the logical position at index 0 of the current wrap. */
+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
+((lpos) & ~DATA_SIZE_MASK(data_ring))
+
+/* Get the ID for the same index of the previous wrap as the given ID. */
+#define DESC_ID_PREV_WRAP(desc_ring, id) \
+DESC_ID((id) - DESCS_COUNT(desc_ring))
+
+/*
+ * A data block: mapped directly to the beginning of the data block area
+ * specified as a logical position within the data ring.
+ *
+ * @id: the ID of the associated descriptor
+ * @data: the writer data
+ *
+ * Note that the size of a data block is only known by its associated
+ * descriptor.
+ */
+struct prb_data_block {
+ unsigned long id;
+ char data[];
+};
+
+/*
+ * Return the descriptor associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
+}
+
+/*
+ * Return the printk_info associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
+}
+
+static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos)
+{
+ return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
+}
+
+/*
+ * Increase the data size to account for data block meta data plus any
+ * padding so that the adjacent data block is aligned on the ID size.
+ */
+static unsigned int to_blk_size(unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ size += sizeof(*db);
+ size = ALIGN(size, sizeof(db->id));
+ return size;
+}
+
+/*
+ * Sanity checker for reserve size. The ringbuffer code assumes that a data
+ * block does not exceed the maximum possible size that could fit within the
+ * ringbuffer. This function provides that basic size check so that the
+ * assumption is safe.
+ */
+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ if (size == 0)
+ return true;
+
+ /*
+ * Ensure the alignment padded size could possibly fit in the data
+ * array. The largest possible data block must still leave room for
+ * at least the ID of the next block.
+ */
+ size = to_blk_size(size);
+ if (size > DATA_SIZE(data_ring) - sizeof(db->id))
+ return false;
+
+ return true;
+}
+
+/* Query the state of a descriptor. */
+static enum desc_state get_desc_state(unsigned long id,
+ unsigned long state_val)
+{
+ if (id != DESC_ID(state_val))
+ return desc_miss;
+
+ return DESC_STATE(state_val);
+}
+
+/*
+ * Get a copy of a specified descriptor and return its queried state. If the
+ * descriptor is in an inconsistent state (miss or reserved), the caller can
+ * only expect the descriptor's @state_var field to be valid.
+ *
+ * The sequence number and caller_id can be optionally retrieved. Like all
+ * non-state_var data, they are only valid if the descriptor is in a
+ * consistent state.
+ */
+static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
+ unsigned long id, struct prb_desc *desc_out,
+ u64 *seq_out, u32 *caller_id_out)
+{
+ struct printk_info *info = to_info(desc_ring, id);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+ enum desc_state d_state;
+ unsigned long state_val;
+
+ /* Check the descriptor state. */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
+ d_state = get_desc_state(id, state_val);
+ if (d_state == desc_miss || d_state == desc_reserved) {
+ /*
+ * The descriptor is in an inconsistent state. Set at least
+ * @state_var so that the caller can see the details of
+ * the inconsistent state.
+ */
+ goto out;
+ }
+
+ /*
+ * Guarantee the state is loaded before copying the descriptor
+ * content. This avoids copying obsolete descriptor content that might
+ * not apply to the descriptor state. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
+ * from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * RMB from desc_read:A to desc_read:C
+ */
+ smp_rmb(); /* LMM(desc_read:B) */
+
+ /*
+ * Copy the descriptor data. The data is not valid until the
+ * state has been re-checked. A memcpy() for all of @desc
+ * cannot be used because of the atomic_t @state_var field.
+ */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
+ if (seq_out)
+ *seq_out = info->seq; /* also part of desc_read:C */
+ if (caller_id_out)
+ *caller_id_out = info->caller_id; /* also part of desc_read:C */
+
+ /*
+ * 1. Guarantee the descriptor content is loaded before re-checking
+ * the state. This avoids reading an obsolete descriptor state
+ * that may not apply to the copied content. This pairs with
+ * desc_reserve:F.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:C reads from desc_reserve:G, then desc_read:E
+ * reads from desc_reserve:F.
+ *
+ * Relies on:
+ *
+ * WMB from desc_reserve:F to desc_reserve:G
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * 2. Guarantee the record data is loaded before re-checking the
+ * state. This avoids reading an obsolete descriptor state that may
+ * not apply to the copied data. This pairs with data_alloc:A and
+ * data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If copy_data:A reads from data_alloc:B, then desc_read:E
+ * reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_alloc:B
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * Note: desc_make_reusable:A and data_alloc:B can be different
+ * CPUs. However, the data_alloc:B CPU (which performs the
+ * full memory barrier) must have previously seen
+ * desc_make_reusable:A.
+ */
+ smp_rmb(); /* LMM(desc_read:D) */
+
+ /*
+ * The data has been copied. Return the current descriptor state,
+ * which may have changed since the load above.
+ */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
+ d_state = get_desc_state(id, state_val);
+out:
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
+ return d_state;
+}
+
+/*
+ * Take a specified descriptor out of the finalized state by attempting
+ * the transition from finalized to reusable. Either this context or some
+ * other context will have been successful.
+ */
+static void desc_make_reusable(struct prb_desc_ring *desc_ring,
+ unsigned long id)
+{
+ unsigned long val_finalized = DESC_SV(id, desc_finalized);
+ unsigned long val_reusable = DESC_SV(id, desc_reusable);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+
+ atomic_long_cmpxchg_relaxed(state_var, val_finalized,
+ val_reusable); /* LMM(desc_make_reusable:A) */
+}
+
+/*
+ * Given the text data ring, put the associated descriptor of each
+ * data block from @lpos_begin until @lpos_end into the reusable state.
+ *
+ * If there is any problem making the associated descriptor reusable, either
+ * the descriptor has not yet been finalized or another writer context has
+ * already pushed the tail lpos past the problematic data block. Regardless,
+ * on error the caller can re-load the tail lpos to determine the situation.
+ */
+static bool data_make_reusable(struct printk_ringbuffer *rb,
+ unsigned long lpos_begin,
+ unsigned long lpos_end,
+ unsigned long *lpos_out)
+{
+
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct prb_data_block *blk;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
+ unsigned long id;
+
+ /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
+ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
+ blk = to_block(data_ring, lpos_begin);
+
+ /*
+ * Load the block ID from the data block. This is a data race
+ * against a writer that may have newly reserved this data
+ * area. If the loaded value matches a valid descriptor ID,
+ * the blk_lpos of that descriptor will be checked to make
+ * sure it points back to this data block. If the check fails,
+ * the data area has been recycled by another writer.
+ */
+ id = blk->id; /* LMM(data_make_reusable:A) */
+
+ d_state = desc_read(desc_ring, id, &desc,
+ NULL, NULL); /* LMM(data_make_reusable:B) */
+
+ switch (d_state) {
+ case desc_miss:
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ desc_make_reusable(desc_ring, id);
+ break;
+ case desc_reusable:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ break;
+ }
+
+ /* Advance @lpos_begin to the next data block. */
+ lpos_begin = blk_lpos->next;
+ }
+
+ *lpos_out = lpos_begin;
+ return true;
+}
+
+/*
+ * Advance the data ring tail to at least @lpos. This function puts
+ * descriptors into the reusable state if the tail is pushed beyond
+ * their associated data block.
+ */
+static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ unsigned long tail_lpos_new;
+ unsigned long tail_lpos;
+ unsigned long next_lpos;
+
+ /* If @lpos is from a data-less block, there is nothing to do. */
+ if (LPOS_DATALESS(lpos))
+ return true;
+
+ /*
+ * Any descriptor states that have transitioned to reusable due to the
+ * data tail being pushed to this loaded value will be visible to this
+ * CPU. This pairs with data_push_tail:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_push_tail:A reads from data_push_tail:D, then this CPU can
+ * see desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_push_tail:D
+ * matches
+ * READFROM from data_push_tail:D to data_push_tail:A
+ * thus
+ * READFROM from desc_make_reusable:A to this CPU
+ */
+ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
+
+ /*
+ * Loop until the tail lpos is at or beyond @lpos. This condition
+ * may already be satisfied, resulting in no full memory barrier
+ * from data_push_tail:D being performed. However, since this CPU
+ * sees the new tail lpos, any descriptor states that transitioned to
+ * the reusable state must already be visible.
+ */
+ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
+ /*
+ * Make all descriptors reusable that are associated with
+ * data blocks before @lpos.
+ */
+ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
+ /*
+ * 1. Guarantee the block ID loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled data area causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * data_alloc:A and data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:A reads from data_alloc:B,
+ * then data_push_tail:C reads from
+ * data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to data_alloc:B
+ * matching
+ * RMB from data_make_reusable:A to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and data_alloc:B can be
+ * different CPUs. However, the data_alloc:B
+ * CPU (which performs the full memory
+ * barrier) must have previously seen
+ * data_push_tail:D.
+ *
+ * 2. Guarantee the descriptor state loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled descriptor causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:B reads from
+ * desc_reserve:F, then data_push_tail:C reads
+ * from data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to desc_reserve:F
+ * matching
+ * RMB from data_make_reusable:B to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and desc_reserve:F can
+ * be different CPUs. However, the
+ * desc_reserve:F CPU (which performs the
+ * full memory barrier) must have previously
+ * seen data_push_tail:D.
+ */
+ smp_rmb(); /* LMM(data_push_tail:B) */
+
+ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
+ ); /* LMM(data_push_tail:C) */
+ if (tail_lpos_new == tail_lpos)
+ return false;
+
+ /* Another CPU pushed the tail. Try again. */
+ tail_lpos = tail_lpos_new;
+ continue;
+ }
+
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail lpos. A full
+ * memory barrier is needed since other CPUs may have made
+ * the descriptor states reusable. This pairs with
+ * data_push_tail:A.
+ */
+ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
+ next_lpos)) { /* LMM(data_push_tail:D) */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Advance the desc ring tail. This function advances the tail by one
+ * descriptor, thus invalidating the oldest descriptor. Before advancing
+ * the tail, the tail descriptor is made reusable and all data blocks up to
+ * and including the descriptor's data block are invalidated (i.e. the data
+ * ring tail is pushed past the data block of the descriptor being made
+ * reusable).
+ */
+static bool desc_push_tail(struct printk_ringbuffer *rb,
+ unsigned long tail_id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+
+ d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
+
+ switch (d_state) {
+ case desc_miss:
+ /*
+ * If the ID is exactly 1 wrap behind the expected, it is
+ * in the process of being reserved by another writer and
+ * must be considered reserved.
+ */
+ if (DESC_ID(atomic_long_read(&desc.state_var)) ==
+ DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
+ return false;
+ }
+
+ /*
+ * The ID has changed. Another writer must have pushed the
+ * tail and recycled the descriptor already. Success is
+ * returned because the caller is only interested in the
+ * specified tail being pushed, which it was.
+ */
+ return true;
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ desc_make_reusable(desc_ring, tail_id);
+ break;
+ case desc_reusable:
+ break;
+ }
+
+ /*
+ * Data blocks must be invalidated before their associated
+ * descriptor can be made available for recycling. Invalidating
+ * them later is not possible because there is no way to trust
+ * data blocks once their associated descriptor is gone.
+ */
+
+ if (!data_push_tail(rb, desc.text_blk_lpos.next))
+ return false;
+
+ /*
+ * Check the next descriptor after @tail_id before pushing the tail
+ * to it because the tail must always be in a finalized or reusable
+ * state. The implementation of prb_first_seq() relies on this.
+ *
+ * A successful read implies that the next descriptor is less than or
+ * equal to @head_id so there is no risk of pushing the tail past the
+ * head.
+ */
+ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
+ NULL, NULL); /* LMM(desc_push_tail:A) */
+
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail ID. This allows
+ * verifying the recycled descriptor state. A full memory
+ * barrier is needed since other CPUs may have made the
+ * descriptor states reusable. This pairs with desc_reserve:D.
+ */
+ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
+ DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
+ } else {
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail ID in the
+ * case that the descriptor has been recycled. This pairs
+ * with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_push_tail:A reads from desc_reserve:F, then
+ * desc_push_tail:D reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB from desc_push_tail:A to desc_push_tail:D
+ *
+ * Note: desc_push_tail:B and desc_reserve:F can be different
+ * CPUs. However, the desc_reserve:F CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_push_tail:C) */
+
+ /*
+ * Re-check the tail ID. The descriptor following @tail_id is
+ * not in an allowed tail state. But if the tail has since
+ * been moved by another CPU, then it does not matter.
+ */
+ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
+ return false;
+ }
+
+ return true;
+}
+
+/* Reserve a new descriptor, invalidating the oldest if necessary. */
+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val;
+ unsigned long id_prev_wrap;
+ struct prb_desc *desc;
+ unsigned long head_id;
+ unsigned long id;
+
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
+
+ do {
+ id = DESC_ID(head_id + 1);
+ id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
+
+ /*
+ * Guarantee the head ID is read before reading the tail ID.
+ * Since the tail ID is updated before the head ID, this
+ * guarantees that @id_prev_wrap is never ahead of the tail
+ * ID. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:A reads from desc_reserve:D, then
+ * desc_reserve:C reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:D
+ * matching
+ * RMB from desc_reserve:A to desc_reserve:C
+ *
+ * Note: desc_push_tail:B and desc_reserve:D can be different
+ * CPUs. However, the desc_reserve:D CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_reserve:B) */
+
+ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
+ )) { /* LMM(desc_reserve:C) */
+ /*
+ * Make space for the new descriptor by
+ * advancing the tail.
+ */
+ if (!desc_push_tail(rb, id_prev_wrap))
+ return false;
+ }
+
+ /*
+ * 1. Guarantee the tail ID is read before validating the
+ * recycled descriptor state. A read memory barrier is
+ * sufficient for this. This pairs with desc_push_tail:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:C reads from desc_push_tail:B, then
+ * desc_reserve:E reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to desc_push_tail:B
+ * matching
+ * RMB from desc_reserve:C to desc_reserve:E
+ *
+ * Note: desc_make_reusable:A and desc_push_tail:B can be
+ * different CPUs. However, the desc_push_tail:B CPU
+ * (which performs the full memory barrier) must have
+ * previously seen desc_make_reusable:A.
+ *
+ * 2. Guarantee the tail ID is stored before storing the head
+ * ID. This pairs with desc_reserve:B.
+ *
+ * 3. Guarantee any data ring tail changes are stored before
+ * recycling the descriptor. Data ring tail changes can
+ * happen via desc_push_tail()->data_push_tail(). A full
+ * memory barrier is needed since another CPU may have
+ * pushed the data ring tails. This pairs with
+ * data_push_tail:B.
+ *
+ * 4. Guarantee a new tail ID is stored before recycling the
+ * descriptor. A full memory barrier is needed since
+ * another CPU may have pushed the tail ID. This pairs
+ * with desc_push_tail:C and this also pairs with
+ * prb_first_seq:C.
+ *
+ * 5. Guarantee the head ID is stored before trying to
+ * finalize the previous descriptor. This pairs with
+ * _prb_commit:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
+ id)); /* LMM(desc_reserve:D) */
+
+ desc = to_desc(desc_ring, id);
+
+ /*
+ * If the descriptor has been recycled, verify the old state val.
+ * See "ABA Issues" about why this verification is performed.
+ */
+ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
+ if (prev_state_val &&
+ get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Assign the descriptor a new ID and set its state to reserved.
+ * See "ABA Issues" about why cmpxchg() instead of set() is used.
+ *
+ * Guarantee the new descriptor ID and state is stored before making
+ * any other changes. A write memory barrier is sufficient for this.
+ * This pairs with desc_read:D.
+ */
+ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Now data in @desc can be modified: LMM(desc_reserve:G) */
+
+ *id_out = id;
+ return true;
+}
+
+/* Determine the end of a data block. */
+static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
+ unsigned long lpos, unsigned int size)
+{
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ begin_lpos = lpos;
+ next_lpos = lpos + size;
+
+ /* First check if the data block does not wrap. */
+ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
+ return next_lpos;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
+}
+
+/*
+ * Allocate a new data block, invalidating the oldest data block(s)
+ * if necessary. This function also associates the data block with
+ * a specified descriptor.
+ */
+static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ if (size == 0) {
+ /* Specify a data-less block. */
+ blk_lpos->begin = NO_LPOS;
+ blk_lpos->next = NO_LPOS;
+ return NULL;
+ }
+
+ size = to_blk_size(size);
+
+ begin_lpos = atomic_long_read(&data_ring->head_lpos);
+
+ do {
+ next_lpos = get_next_lpos(data_ring, begin_lpos, size);
+
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
+ /* Failed to allocate, specify a data-less block. */
+ blk_lpos->begin = FAILED_LPOS;
+ blk_lpos->next = FAILED_LPOS;
+ return NULL;
+ }
+
+ /*
+ * 1. Guarantee any descriptor states that have transitioned
+ * to reusable are stored before modifying the newly
+ * allocated data area. A full memory barrier is needed
+ * since other CPUs may have made the descriptor states
+ * reusable. See data_push_tail:A about why the reusable
+ * states are visible. This pairs with desc_read:D.
+ *
+ * 2. Guarantee any updated tail lpos is stored before
+ * modifying the newly allocated data area. Another CPU may
+ * be in data_make_reusable() and is reading a block ID
+ * from this area. data_make_reusable() can handle reading
+ * a garbage block ID value, but then it must be able to
+ * load a new tail lpos. A full memory barrier is needed
+ * since other CPUs may have updated the tail lpos. This
+ * pairs with data_push_tail:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
+ next_lpos)); /* LMM(data_alloc:A) */
+
+ blk = to_block(data_ring, begin_lpos);
+ blk->id = id; /* LMM(data_alloc:B) */
+
+ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+ }
+
+ blk_lpos->begin = begin_lpos;
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/*
+ * Try to resize an existing data block associated with the descriptor
+ * specified by @id. If the resized data block should become wrapped, it
+ * copies the old data to the new data block. If @size yields a data block
+ * with the same or less size, the data block is left as is.
+ *
+ * Fail if this is not the last allocated data block or if there is not
+ * enough space or it is not possible make enough space.
+ *
+ * Return a pointer to the beginning of the entire data buffer or NULL on
+ * failure.
+ */
+static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long head_lpos;
+ unsigned long next_lpos;
+ bool wrapped;
+
+ /* Reallocation only works if @blk_lpos is the newest data block. */
+ head_lpos = atomic_long_read(&data_ring->head_lpos);
+ if (head_lpos != blk_lpos->next)
+ return NULL;
+
+ /* Keep track if @blk_lpos was a wrapping data block. */
+ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
+
+ size = to_blk_size(size);
+
+ next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
+
+ /* If the data block does not increase, there is nothing to do. */
+ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
+ if (wrapped)
+ blk = to_block(data_ring, 0);
+ else
+ blk = to_block(data_ring, blk_lpos->begin);
+ return &blk->data[0];
+ }
+
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
+ return NULL;
+
+ /* The memory barrier involvement is the same as data_alloc:A. */
+ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
+ next_lpos)) { /* LMM(data_realloc:A) */
+ return NULL;
+ }
+
+ blk = to_block(data_ring, blk_lpos->begin);
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
+ struct prb_data_block *old_blk = blk;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+
+ if (!wrapped) {
+ /*
+ * Since the allocated space is now in the newly
+ * created wrapping data block, copy the content
+ * from the old data block.
+ */
+ memcpy(&blk->data[0], &old_blk->data[0],
+ (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
+ }
+ }
+
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/* Return the number of bytes used by a data block. */
+static unsigned int space_used(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos)
+{
+ /* Data-less blocks take no space. */
+ if (BLK_DATALESS(blk_lpos))
+ return 0;
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
+ /* Data block does not wrap. */
+ return (DATA_INDEX(data_ring, blk_lpos->next) -
+ DATA_INDEX(data_ring, blk_lpos->begin));
+ }
+
+ /*
+ * For wrapping data blocks, the trailing (wasted) space is
+ * also counted.
+ */
+ return (DATA_INDEX(data_ring, blk_lpos->next) +
+ DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
+}
+
+/*
+ * Given @blk_lpos, return a pointer to the writer data from the data block
+ * and calculate the size of the data part. A NULL pointer is returned if
+ * @blk_lpos specifies values that could never be legal.
+ *
+ * This function (used by readers) performs strict validation on the lpos
+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static const char *get_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos,
+ unsigned int *data_size)
+{
+ struct prb_data_block *db;
+
+ /* Data-less data block description. */
+ if (BLK_DATALESS(blk_lpos)) {
+ if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) {
+ *data_size = 0;
+ return "";
+ }
+ return NULL;
+ }
+
+ /* Regular data block: @begin less than @next and in same wrap. */
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
+ blk_lpos->begin < blk_lpos->next) {
+ db = to_block(data_ring, blk_lpos->begin);
+ *data_size = blk_lpos->next - blk_lpos->begin;
+
+ /* Wrapping data block: @begin is one wrap behind @next. */
+ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
+ DATA_WRAPS(data_ring, blk_lpos->next)) {
+ db = to_block(data_ring, 0);
+ *data_size = DATA_INDEX(data_ring, blk_lpos->next);
+
+ /* Illegal block description. */
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /* A valid data block will always be aligned to the ID size. */
+ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
+ WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
+ return NULL;
+ }
+
+ /* A valid data block will always have at least an ID. */
+ if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
+ return NULL;
+
+ /* Subtract block ID space from size to reflect data size. */
+ *data_size -= sizeof(db->id);
+
+ return &db->data[0];
+}
+
+/*
+ * Attempt to transition the newest descriptor from committed back to reserved
+ * so that the record can be modified by a writer again. This is only possible
+ * if the descriptor is not yet finalized and the provided @caller_id matches.
+ */
+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
+ u32 caller_id, unsigned long *id_out)
+{
+ unsigned long prev_state_val;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_desc *d;
+ unsigned long id;
+ u32 cid;
+
+ id = atomic_long_read(&desc_ring->head_id);
+
+ /*
+ * To reduce unnecessarily reopening, first check if the descriptor
+ * state and caller ID are correct.
+ */
+ d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
+ if (d_state != desc_committed || cid != caller_id)
+ return NULL;
+
+ d = to_desc(desc_ring, id);
+
+ prev_state_val = DESC_SV(id, desc_committed);
+
+ /*
+ * Guarantee the reserved state is stored before reading any
+ * record data. A full memory barrier is needed because @state_var
+ * modification is followed by reading. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reopen_last:A reads from _prb_commit:B, then
+ * prb_reserve_in_last:A reads from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * MB If desc_reopen_last:A to prb_reserve_in_last:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
+ return NULL;
+ }
+
+ *id_out = id;
+ return d;
+}
+
+/**
+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
+ * used by the newest record.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to re-reserve and extend data in.
+ * @r: The record structure to allocate buffers for.
+ * @caller_id: The caller ID of the caller (reserving writer).
+ * @max_size: Fail if the extended size would be greater than this.
+ *
+ * This is the public function available to writers to re-reserve and extend
+ * data.
+ *
+ * The writer specifies the text size to extend (not the new total size) by
+ * setting the @text_buf_size field of @r. To ensure proper initialization
+ * of @r, prb_rec_init_wr() should be used.
+ *
+ * This function will fail if @caller_id does not match the caller ID of the
+ * newest record. In that case the caller must reserve new data using
+ * prb_reserve().
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if text data could be extended, otherwise false.
+ *
+ * On success:
+ *
+ * - @r->text_buf points to the beginning of the entire text buffer.
+ *
+ * - @r->text_buf_size is set to the new total size of the buffer.
+ *
+ * - @r->info is not touched so that @r->info->text_len could be used
+ * to append the text.
+ *
+ * - prb_record_text_space() can be used on @e to query the new
+ * actually used space.
+ *
+ * Important: All @r->info fields will already be set with the current values
+ * for the record. I.e. @r->info->text_len will be less than
+ * @text_buf_size. Writers can use @r->info->text_len to know
+ * where concatenation begins and writers should update
+ * @r->info->text_len after concatenating.
+ */
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ unsigned int data_size;
+ struct prb_desc *d;
+ unsigned long id;
+
+ local_irq_save(e->irqflags);
+
+ /* Transition the newest descriptor back to the reserved state. */
+ d = desc_reopen_last(desc_ring, caller_id, &id);
+ if (!d) {
+ local_irq_restore(e->irqflags);
+ goto fail_reopen;
+ }
+
+ /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
+
+ info = to_info(desc_ring, id);
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * anything fails from now on.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * desc_reopen_last() checked the caller_id, but there was no
+ * exclusive access at that point. The descriptor may have
+ * changed since then.
+ */
+ if (caller_id != info->caller_id)
+ goto fail;
+
+ if (BLK_DATALESS(&d->text_blk_lpos)) {
+ if (WARN_ON_ONCE(info->text_len != 0)) {
+ pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
+ info->text_len);
+ info->text_len = 0;
+ }
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_alloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ } else {
+ if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
+ goto fail;
+
+ /*
+ * Increase the buffer size to include the original size. If
+ * the meta data (@text_len) is not sane, use the full data
+ * block size.
+ */
+ if (WARN_ON_ONCE(info->text_len > data_size)) {
+ pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
+ info->text_len, data_size);
+ info->text_len = data_size;
+ }
+ r->text_buf_size += info->text_len;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_realloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ }
+ if (r->text_buf_size && !r->text_buf)
+ goto fail;
+
+ r->info = info;
+
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+fail_reopen:
+ /* Make it clear to the caller that the re-reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/*
+ * Attempt to finalize a specified descriptor. If this fails, the descriptor
+ * is either already final or it will finalize itself when the writer commits.
+ */
+static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
+{
+ unsigned long prev_state_val = DESC_SV(id, desc_committed);
+ struct prb_desc *d = to_desc(desc_ring, id);
+
+ atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
+ DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, id);
+}
+
+/**
+ * prb_reserve() - Reserve space in the ringbuffer.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @r: The record structure to allocate buffers for.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * The writer specifies the text size to reserve by setting the
+ * @text_buf_size field of @r. To ensure proper initialization of @r,
+ * prb_rec_init_wr() should be used.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if at least text data could be allocated, otherwise false.
+ *
+ * On success, the fields @info and @text_buf of @r will be set by this
+ * function and should be filled in by the writer before committing. Also
+ * on success, prb_record_text_space() can be used on @e to query the actual
+ * space used for the text data block.
+ *
+ * Important: @info->text_len needs to be set correctly by the writer in
+ * order for data to be readable and/or extended. Its value
+ * is initialized to 0.
+ */
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ struct prb_desc *d;
+ unsigned long id;
+ u64 seq;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ /*
+ * Descriptors in the reserved state act as blockers to all further
+ * reservations once the desc_ring has fully wrapped. Disable
+ * interrupts during the reserve/commit window in order to minimize
+ * the likelihood of this happening.
+ */
+ local_irq_save(e->irqflags);
+
+ if (!desc_reserve(rb, &id)) {
+ /* Descriptor reservation failures are tracked. */
+ atomic_long_inc(&rb->fail);
+ local_irq_restore(e->irqflags);
+ goto fail;
+ }
+
+ d = to_desc(desc_ring, id);
+ info = to_info(desc_ring, id);
+
+ /*
+ * All @info fields (except @seq) are cleared and must be filled in
+ * by the writer. Save @seq before clearing because it is used to
+ * determine the new sequence number.
+ */
+ seq = info->seq;
+ memset(info, 0, sizeof(*info));
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * text data allocation fails.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * Initialize the sequence number if it has "never been set".
+ * Otherwise just increment it by a full wrap.
+ *
+ * @seq is considered "never been set" if it has a value of 0,
+ * _except_ for @infos[0], which was specially setup by the ringbuffer
+ * initializer and therefore is always considered as set.
+ *
+ * See the "Bootstrap" comment block in printk_ringbuffer.h for
+ * details about how the initializer bootstraps the descriptors.
+ */
+ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
+ info->seq = DESC_INDEX(desc_ring, id);
+ else
+ info->seq = seq + DESCS_COUNT(desc_ring);
+
+ /*
+ * New data is about to be reserved. Once that happens, previous
+ * descriptors are no longer able to be extended. Finalize the
+ * previous descriptor now so that it can be made available to
+ * readers. (For seq==0 there is no previous descriptor.)
+ */
+ if (info->seq > 0)
+ desc_make_final(desc_ring, DESC_ID(id - 1));
+
+ r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
+ /* If text data allocation fails, a data-less record is committed. */
+ if (r->text_buf_size && !r->text_buf) {
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+ goto fail;
+ }
+
+ r->info = info;
+
+ /* Record full text space used by record. */
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ /* Make it clear to the caller that the reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/* Commit the data (possibly finalizing it) and restore interrupts. */
+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ struct prb_desc *d = to_desc(desc_ring, e->id);
+ unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
+
+ /* Now the writer has finished all writing: LMM(_prb_commit:A) */
+
+ /*
+ * Set the descriptor as committed. See "ABA Issues" about why
+ * cmpxchg() instead of set() is used.
+ *
+ * 1 Guarantee all record data is stored before the descriptor state
+ * is stored as committed. A write memory barrier is sufficient
+ * for this. This pairs with desc_read:B and desc_reopen_last:A.
+ *
+ * 2. Guarantee the descriptor state is stored as committed before
+ * re-checking the head ID in order to possibly finalize this
+ * descriptor. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_commit:A reads from desc_reserve:D, then
+ * desc_make_final:A reads from _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * MB _prb_commit:B to prb_commit:A
+ * matching
+ * MB desc_reserve:D to desc_make_final:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Restore interrupts, the reserve/commit window is finished. */
+ local_irq_restore(e->irqflags);
+}
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Note that the data is not yet available to readers until it is finalized.
+ * Finalizing happens automatically when space for the next record is
+ * reserved.
+ *
+ * See prb_final_commit() for a version of this function that finalizes
+ * immediately.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ unsigned long head_id;
+
+ _prb_commit(e, desc_committed);
+
+ /*
+ * If this descriptor is no longer the head (i.e. a new record has
+ * been allocated), extending the data for this record is no longer
+ * allowed and therefore it must be finalized.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
+ if (head_id != e->id)
+ desc_make_final(desc_ring, e->id);
+}
+
+/**
+ * prb_final_commit() - Commit and finalize (previously reserved) data to
+ * the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit+finalize data.
+ *
+ * By finalizing, the data is made immediately available to readers.
+ *
+ * This function should only be used if there are no intentions of extending
+ * this data using prb_reserve_in_last().
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_final_commit(struct prb_reserved_entry *e)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+
+ _prb_commit(e, desc_finalized);
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, e->id);
+}
+
+/*
+ * Count the number of lines in provided text. All text has at least 1 line
+ * (even if @text_size is 0). Each '\n' processed is counted as an additional
+ * line.
+ */
+static unsigned int count_lines(const char *text, unsigned int text_size)
+{
+ unsigned int next_size = text_size;
+ unsigned int line_count = 1;
+ const char *next = text;
+
+ while (next_size) {
+ next = memchr(next, '\n', next_size);
+ if (!next)
+ break;
+ line_count++;
+ next++;
+ next_size = text_size - (next - text);
+ }
+
+ return line_count;
+}
+
+/*
+ * Given @blk_lpos, copy an expected @len of data into the provided buffer.
+ * If @line_count is provided, count the number of lines in the data.
+ *
+ * This function (used by readers) performs strict validation on the data
+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static bool copy_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
+ unsigned int buf_size, unsigned int *line_count)
+{
+ unsigned int data_size;
+ const char *data;
+
+ /* Caller might not want any data. */
+ if ((!buf || !buf_size) && !line_count)
+ return true;
+
+ data = get_data(data_ring, blk_lpos, &data_size);
+ if (!data)
+ return false;
+
+ /*
+ * Actual cannot be less than expected. It can be more than expected
+ * because of the trailing alignment padding.
+ *
+ * Note that invalid @len values can occur because the caller loads
+ * the value during an allowed data race.
+ */
+ if (data_size < (unsigned int)len)
+ return false;
+
+ /* Caller interested in the line count? */
+ if (line_count)
+ *line_count = count_lines(data, len);
+
+ /* Caller interested in the data content? */
+ if (!buf || !buf_size)
+ return true;
+
+ data_size = min_t(unsigned int, buf_size, len);
+
+ memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
+ return true;
+}
+
+/*
+ * This is an extended version of desc_read(). It gets a copy of a specified
+ * descriptor. However, it also verifies that the record is finalized and has
+ * the sequence number @seq. On success, 0 is returned.
+ *
+ * Error return values:
+ * -EINVAL: A finalized record with sequence number @seq does not exist.
+ * -ENOENT: A finalized record with sequence number @seq exists, but its data
+ * is not available. This is a valid record, so readers should
+ * continue with the next record.
+ */
+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
+ unsigned long id, u64 seq,
+ struct prb_desc *desc_out)
+{
+ struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
+ enum desc_state d_state;
+ u64 s;
+
+ d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
+
+ /*
+ * An unexpected @id (desc_miss) or @seq mismatch means the record
+ * does not exist. A descriptor in the reserved or committed state
+ * means the record does not yet exist for the reader.
+ */
+ if (d_state == desc_miss ||
+ d_state == desc_reserved ||
+ d_state == desc_committed ||
+ s != seq) {
+ return -EINVAL;
+ }
+
+ /*
+ * A descriptor in the reusable state may no longer have its data
+ * available; report it as existing but with lost data. Or the record
+ * may actually be a record with lost data.
+ */
+ if (d_state == desc_reusable ||
+ (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy the ringbuffer data from the record with @seq to the provided
+ * @r buffer. On success, 0 is returned.
+ *
+ * See desc_read_finalized_seq() for error return values.
+ */
+static int prb_read(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info = to_info(desc_ring, seq);
+ struct prb_desc *rdesc = to_desc(desc_ring, seq);
+ atomic_long_t *state_var = &rdesc->state_var;
+ struct prb_desc desc;
+ unsigned long id;
+ int err;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ id = DESC_ID(atomic_long_read(state_var));
+
+ /* Get a local copy of the correct descriptor (if available). */
+ err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
+
+ /*
+ * If @r is NULL, the caller is only interested in the availability
+ * of the record.
+ */
+ if (err || !r)
+ return err;
+
+ /* If requested, copy meta data. */
+ if (r->info)
+ memcpy(r->info, info, sizeof(*(r->info)));
+
+ /* Copy text data. If it fails, this is a data-less record. */
+ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
+ r->text_buf, r->text_buf_size, line_count)) {
+ return -ENOENT;
+ }
+
+ /* Ensure the record is still finalized and has the same @seq. */
+ return desc_read_finalized_seq(desc_ring, id, seq, &desc);
+}
+
+/* Get the sequence number of the tail descriptor. */
+static u64 prb_first_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ unsigned long id;
+ u64 seq;
+
+ for (;;) {
+ id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
+
+ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
+
+ /*
+ * This loop will not be infinite because the tail is
+ * _always_ in the finalized or reusable state.
+ */
+ if (d_state == desc_finalized || d_state == desc_reusable)
+ break;
+
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail in the case
+ * that the descriptor has been recycled. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_first_seq:B reads from desc_reserve:F, then
+ * prb_first_seq:A reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB prb_first_seq:B to prb_first_seq:A
+ */
+ smp_rmb(); /* LMM(prb_first_seq:C) */
+ }
+
+ return seq;
+}
+
+/*
+ * Non-blocking read of a record. Updates @seq to the last finalized record
+ * (which may have no data available).
+ *
+ * See the description of prb_read_valid() and prb_read_valid_info()
+ * for details.
+ */
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ u64 tail_seq;
+ int err;
+
+ while ((err = prb_read(rb, *seq, r, line_count))) {
+ tail_seq = prb_first_seq(rb);
+
+ if (*seq < tail_seq) {
+ /*
+ * Behind the tail. Catch up and try again. This
+ * can happen for -ENOENT and -EINVAL cases.
+ */
+ *seq = tail_seq;
+
+ } else if (err == -ENOENT) {
+ /* Record exists, but no data available. Skip. */
+ (*seq)++;
+
+ } else {
+ /* Non-existent/non-finalized record. Must stop. */
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * prb_read_valid() - Non-blocking read of a requested record or (if gone)
+ * the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @r: A record data buffer to store the read record to.
+ *
+ * This is the public function available to readers to read a record.
+ *
+ * The reader provides the @info and @text_buf buffers of @r to be
+ * filled in. Any of the buffer pointers can be set to NULL if the reader
+ * is not interested in that data. To ensure proper initialization of @r,
+ * prb_rec_init_rd() should be used.
+ *
+ * Context: Any context.
+ * Return: true if a record was read, otherwise false.
+ *
+ * On success, the reader must check r->info.seq to see which record was
+ * actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r)
+{
+ return _prb_read_valid(rb, &seq, r, NULL);
+}
+
+/**
+ * prb_read_valid_info() - Non-blocking read of meta data for a requested
+ * record or (if gone) the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @info: A buffer to store the read record meta data to.
+ * @line_count: A buffer to store the number of lines in the record text.
+ *
+ * This is the public function available to readers to read only the
+ * meta data of a record.
+ *
+ * The reader provides the @info, @line_count buffers to be filled in.
+ * Either of the buffer pointers can be set to NULL if the reader is not
+ * interested in that data.
+ *
+ * Context: Any context.
+ * Return: true if a record's meta data was read, otherwise false.
+ *
+ * On success, the reader must check info->seq to see which record meta data
+ * was actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count)
+{
+ struct printk_record r;
+
+ prb_rec_init_rd(&r, info, NULL, 0);
+
+ return _prb_read_valid(rb, &seq, &r, line_count);
+}
+
+/**
+ * prb_first_valid_seq() - Get the sequence number of the oldest available
+ * record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the
+ * first/oldest valid sequence number is.
+ *
+ * This provides readers a starting point to begin iterating the ringbuffer.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the first/oldest record or, if the
+ * ringbuffer is empty, 0 is returned.
+ */
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq = 0;
+
+ if (!_prb_read_valid(rb, &seq, NULL, NULL))
+ return 0;
+
+ return seq;
+}
+
+/**
+ * prb_next_seq() - Get the sequence number after the last available record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the next
+ * newest sequence number available to readers will be.
+ *
+ * This provides readers a sequence number to jump to if all currently
+ * available records should be skipped.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the next newest (not yet available) record
+ * for readers.
+ */
+u64 prb_next_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ unsigned long id;
+ u64 seq;
+
+ /* Check if the cached @id still points to a valid @seq. */
+ id = atomic_long_read(&desc_ring->last_finalized_id);
+ d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
+
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
+ } else {
+ /*
+ * The information about the last finalized sequence number
+ * has gone. It should happen only when there is a flood of
+ * new messages and the ringbuffer is rapidly recycled.
+ * Give up and start from the beginning.
+ */
+ seq = 0;
+ }
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
+ while (_prb_read_valid(rb, &seq, NULL, NULL))
+ seq++;
+
+ return seq;
+}
+
+/**
+ * prb_init() - Initialize a ringbuffer to use provided external buffers.
+ *
+ * @rb: The ringbuffer to initialize.
+ * @text_buf: The data buffer for text data.
+ * @textbits: The size of @text_buf as a power-of-2 value.
+ * @descs: The descriptor buffer for ringbuffer records.
+ * @descbits: The count of @descs items as a power-of-2 value.
+ * @infos: The printk_info buffer for ringbuffer records.
+ *
+ * This is the public function available to writers to setup a ringbuffer
+ * during runtime using provided buffers.
+ *
+ * This must match the initialization of DEFINE_PRINTKRB().
+ *
+ * Context: Any context.
+ */
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int textbits,
+ struct prb_desc *descs, unsigned int descbits,
+ struct printk_info *infos)
+{
+ memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
+ memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
+
+ rb->desc_ring.count_bits = descbits;
+ rb->desc_ring.descs = descs;
+ rb->desc_ring.infos = infos;
+ atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
+
+ rb->text_data_ring.size_bits = textbits;
+ rb->text_data_ring.data = text_buf;
+ atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
+ atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
+
+ atomic_long_set(&rb->fail, 0);
+
+ atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
+
+ infos[0].seq = -(u64)_DESCS_COUNT(descbits);
+ infos[_DESCS_COUNT(descbits) - 1].seq = 0;
+}
+
+/**
+ * prb_record_text_space() - Query the full actual used ringbuffer space for
+ * the text data of a reserved entry.
+ *
+ * @e: The successfully reserved entry to query.
+ *
+ * This is the public function available to writers to see how much actual
+ * space is used in the ringbuffer to store the text data of the specified
+ * entry.
+ *
+ * This function is only valid if @e has been successfully reserved using
+ * prb_reserve().
+ *
+ * Context: Any context.
+ * Return: The size in bytes used by the text data of the associated record.
+ */
+unsigned int prb_record_text_space(struct prb_reserved_entry *e)
+{
+ return e->text_space;
+}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
new file mode 100644
index 000000000000..18cd25e489b8
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.h
@@ -0,0 +1,384 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_PRINTK_RINGBUFFER_H
+#define _KERNEL_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/dev_printk.h>
+
+/*
+ * Meta information about each stored message.
+ *
+ * All fields are set by the printk code except for @seq, which is
+ * set by the ringbuffer code.
+ */
+struct printk_info {
+ u64 seq; /* sequence number */
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 text_len; /* length of text message */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+ u32 caller_id; /* thread id or processor id */
+
+ struct dev_printk_info dev_info;
+};
+
+/*
+ * A structure providing the buffers, used by writers and readers.
+ *
+ * Writers:
+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
+ * buffers reserved for that writer.
+ *
+ * Readers:
+ * Using prb_rec_init_rd(), a reader sets all fields before calling
+ * prb_read_valid(). Note that the reader provides the @info and @text_buf,
+ * buffers. On success, the struct pointed to by @info will be filled and
+ * the char array pointed to by @text_buf will be filled with text data.
+ */
+struct printk_record {
+ struct printk_info *info;
+ char *text_buf;
+ unsigned int text_buf_size;
+};
+
+/* Specifies the logical position and span of a data block. */
+struct prb_data_blk_lpos {
+ unsigned long begin;
+ unsigned long next;
+};
+
+/*
+ * A descriptor: the complete meta-data for a record.
+ *
+ * @state_var: A bitwise combination of descriptor ID and descriptor state.
+ */
+struct prb_desc {
+ atomic_long_t state_var;
+ struct prb_data_blk_lpos text_blk_lpos;
+};
+
+/* A ringbuffer of "ID + data" elements. */
+struct prb_data_ring {
+ unsigned int size_bits;
+ char *data;
+ atomic_long_t head_lpos;
+ atomic_long_t tail_lpos;
+};
+
+/* A ringbuffer of "struct prb_desc" elements. */
+struct prb_desc_ring {
+ unsigned int count_bits;
+ struct prb_desc *descs;
+ struct printk_info *infos;
+ atomic_long_t head_id;
+ atomic_long_t tail_id;
+ atomic_long_t last_finalized_id;
+};
+
+/*
+ * The high level structure representing the printk ringbuffer.
+ *
+ * @fail: Count of failed prb_reserve() calls where not even a data-less
+ * record was created.
+ */
+struct printk_ringbuffer {
+ struct prb_desc_ring desc_ring;
+ struct prb_data_ring text_data_ring;
+ atomic_long_t fail;
+};
+
+/*
+ * Used by writers as a reserve/commit handle.
+ *
+ * @rb: Ringbuffer where the entry is reserved.
+ * @irqflags: Saved irq flags to restore on entry commit.
+ * @id: ID of the reserved descriptor.
+ * @text_space: Total occupied buffer space in the text data ring, including
+ * ID, alignment padding, and wrapping data blocks.
+ *
+ * This structure is an opaque handle for writers. Its contents are only
+ * to be used by the ringbuffer implementation.
+ */
+struct prb_reserved_entry {
+ struct printk_ringbuffer *rb;
+ unsigned long irqflags;
+ unsigned long id;
+ unsigned int text_space;
+};
+
+/* The possible responses of a descriptor state-query. */
+enum desc_state {
+ desc_miss = -1, /* ID mismatch (pseudo state) */
+ desc_reserved = 0x0, /* reserved, in use by writer */
+ desc_committed = 0x1, /* committed by writer, could get reopened */
+ desc_finalized = 0x2, /* committed, no further modification allowed */
+ desc_reusable = 0x3, /* free, not yet used by any writer */
+};
+
+#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
+#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
+#define DESC_SV_BITS (sizeof(unsigned long) * 8)
+#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
+#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
+#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
+#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
+#define DESC_ID_MASK (~DESC_FLAGS_MASK)
+#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
+#define FAILED_LPOS 0x1
+#define NO_LPOS 0x3
+
+#define FAILED_BLK_LPOS \
+{ \
+ .begin = FAILED_LPOS, \
+ .next = FAILED_LPOS, \
+}
+
+/*
+ * Descriptor Bootstrap
+ *
+ * The descriptor array is minimally initialized to allow immediate usage
+ * by readers and writers. The requirements that the descriptor array
+ * initialization must satisfy:
+ *
+ * Req1
+ * The tail must point to an existing (committed or reusable) descriptor.
+ * This is required by the implementation of prb_first_seq().
+ *
+ * Req2
+ * Readers must see that the ringbuffer is initially empty.
+ *
+ * Req3
+ * The first record reserved by a writer is assigned sequence number 0.
+ *
+ * To satisfy Req1, the tail initially points to a descriptor that is
+ * minimally initialized (having no data block, i.e. data-less with the
+ * data block's lpos @begin and @next values set to FAILED_LPOS).
+ *
+ * To satisfy Req2, the initial tail descriptor is initialized to the
+ * reusable state. Readers recognize reusable descriptors as existing
+ * records, but skip over them.
+ *
+ * To satisfy Req3, the last descriptor in the array is used as the initial
+ * head (and tail) descriptor. This allows the first record reserved by a
+ * writer (head + 1) to be the first descriptor in the array. (Only the first
+ * descriptor in the array could have a valid sequence number of 0.)
+ *
+ * The first time a descriptor is reserved, it is assigned a sequence number
+ * with the value of the array index. A "first time reserved" descriptor can
+ * be recognized because it has a sequence number of 0 but does not have an
+ * index of 0. (Only the first descriptor in the array could have a valid
+ * sequence number of 0.) After the first reservation, all future reservations
+ * (recycling) simply involve incrementing the sequence number by the array
+ * count.
+ *
+ * Hack #1
+ * Only the first descriptor in the array is allowed to have the sequence
+ * number 0. In this case it is not possible to recognize if it is being
+ * reserved the first time (set to index value) or has been reserved
+ * previously (increment by the array count). This is handled by _always_
+ * incrementing the sequence number by the array count when reserving the
+ * first descriptor in the array. In order to satisfy Req3, the sequence
+ * number of the first descriptor in the array is initialized to minus
+ * the array count. Then, upon the first reservation, it is incremented
+ * to 0, thus satisfying Req3.
+ *
+ * Hack #2
+ * prb_first_seq() can be called at any time by readers to retrieve the
+ * sequence number of the tail descriptor. However, due to Req2 and Req3,
+ * initially there are no records to report the sequence number of
+ * (sequence numbers are u64 and there is nothing less than 0). To handle
+ * this, the sequence number of the initial tail descriptor is initialized
+ * to 0. Technically this is incorrect, because there is no record with
+ * sequence number 0 (yet) and the tail descriptor is not the first
+ * descriptor in the array. But it allows prb_read_valid() to correctly
+ * report the existence of a record for _any_ given sequence number at all
+ * times. Bootstrapping is complete when the tail is pushed the first
+ * time, thus finally pointing to the first descriptor reserved by a
+ * writer, which has the assigned sequence number 0.
+ */
+
+/*
+ * Initiating Logical Value Overflows
+ *
+ * Both logical position (lpos) and ID values can be mapped to array indexes
+ * but may experience overflows during the lifetime of the system. To ensure
+ * that printk_ringbuffer can handle the overflows for these types, initial
+ * values are chosen that map to the correct initial array indexes, but will
+ * result in overflows soon.
+ *
+ * BLK0_LPOS
+ * The initial @head_lpos and @tail_lpos for data rings. It is at index
+ * 0 and the lpos value is such that it will overflow on the first wrap.
+ *
+ * DESC0_ID
+ * The initial @head_id and @tail_id for the desc ring. It is at the last
+ * index of the descriptor array (see Req3 above) and the ID value is such
+ * that it will overflow on the second wrap.
+ */
+#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits)))
+#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
+#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable)
+
+/*
+ * Define a ringbuffer with an external text data buffer. The same as
+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the
+ * text data.
+ *
+ * Note: The specified external buffer must be of the size:
+ * 2 ^ (descbits + avgtextbits)
+ */
+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \
+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reusable */ \
+ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \
+ /* no associated data block */ \
+ .text_blk_lpos = FAILED_BLK_LPOS, \
+ }, \
+}; \
+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \
+ /* this will be the first record reserved by a writer */ \
+ [0] = { \
+ /* will be incremented to 0 on the first reservation */ \
+ .seq = -(u64)_DESCS_COUNT(descbits), \
+ }, \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reports the first seq value during the bootstrap phase */ \
+ .seq = 0, \
+ }, \
+}; \
+static struct printk_ringbuffer name = { \
+ .desc_ring = { \
+ .count_bits = descbits, \
+ .descs = &_##name##_descs[0], \
+ .infos = &_##name##_infos[0], \
+ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ }, \
+ .text_data_ring = { \
+ .size_bits = (avgtextbits) + (descbits), \
+ .data = text_buf, \
+ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ }, \
+ .fail = ATOMIC_LONG_INIT(0), \
+}
+
+/**
+ * DEFINE_PRINTKRB() - Define a ringbuffer.
+ *
+ * @name: The name of the ringbuffer variable.
+ * @descbits: The number of descriptors as a power-of-2 value.
+ * @avgtextbits: The average text data size per record as a power-of-2 value.
+ *
+ * This is a macro for defining a ringbuffer and all internal structures
+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
+ * variant where the text data buffer can be specified externally.
+ */
+#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \
+static char _##name##_text[1U << ((avgtextbits) + (descbits))] \
+ __aligned(__alignof__(unsigned long)); \
+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
+
+/* Writer Interface */
+
+/**
+ * prb_rec_init_wr() - Initialize a buffer for writing records.
+ *
+ * @r: The record to initialize.
+ * @text_buf_size: The needed text buffer size.
+ */
+static inline void prb_rec_init_wr(struct printk_record *r,
+ unsigned int text_buf_size)
+{
+ r->info = NULL;
+ r->text_buf = NULL;
+ r->text_buf_size = text_buf_size;
+}
+
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r);
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size);
+void prb_commit(struct prb_reserved_entry *e);
+void prb_final_commit(struct prb_reserved_entry *e);
+
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int text_buf_size,
+ struct prb_desc *descs, unsigned int descs_count_bits,
+ struct printk_info *infos);
+unsigned int prb_record_text_space(struct prb_reserved_entry *e);
+
+/* Reader Interface */
+
+/**
+ * prb_rec_init_rd() - Initialize a buffer for reading records.
+ *
+ * @r: The record to initialize.
+ * @info: A buffer to store record meta-data.
+ * @text_buf: A buffer to store text data.
+ * @text_buf_size: The size of @text_buf.
+ *
+ * Initialize all the fields that a reader is interested in. All arguments
+ * (except @r) are optional. Only record data for arguments that are
+ * non-NULL or non-zero will be read.
+ */
+static inline void prb_rec_init_rd(struct printk_record *r,
+ struct printk_info *info,
+ char *text_buf, unsigned int text_buf_size)
+{
+ r->info = info;
+ r->text_buf = text_buf;
+ r->text_buf_size = text_buf_size;
+}
+
+/**
+ * prb_for_each_record() - Iterate over the records of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @r: A printk_record to store the record on each iteration.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_record(from, rb, s, r) \
+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
+
+/**
+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @i: A printk_info to store the record meta data on each iteration.
+ * @lc: An unsigned int to store the text line count of each record.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_info(from, rb, s, i, lc) \
+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
+
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r);
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count);
+
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
+u64 prb_next_seq(struct printk_ringbuffer *rb);
+
+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 50aeae770434..6d10927a07d8 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -4,349 +4,16 @@
*/
#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
-#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/kprobes.h>
#include "internal.h"
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examinig current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
static DEFINE_PER_CPU(int, printk_context);
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- static raw_spinlock_t read_lock =
- __RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
@@ -359,7 +26,7 @@ void __printk_safe_exit(void)
this_cpu_dec(printk_context);
}
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
/* Allow to pass printk() to kdb but avoid a recursion. */
@@ -368,47 +35,13 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
#endif
/*
- * Try to use the main logbuf even in NMI. But avoid calling console
+ * Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
- raw_spin_unlock(&logbuf_lock);
- defer_console_output();
- return len;
- }
-
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
+ if (this_cpu_read(printk_context) || in_nmi())
+ return vprintk_deferred(fmt, args);
/* No obstacles. */
return vprintk_default(fmt, args);
}
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
+EXPORT_SYMBOL(vprintk);
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
new file mode 100644
index 000000000000..c228343eeb97
--- /dev/null
+++ b/kernel/printk/sysctl.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysctl.c: General linux system control interface
+ */
+
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include "internal.h"
+
+static const int ten_thousand = 10000;
+
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table printk_sysctls[] = {
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&ten_thousand,
+ },
+ {
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {}
+};
+
+void __init printk_sysctl_init(void)
+{
+ register_sysctl_init("kernel", printk_sysctls);
+}
diff --git a/kernel/profile.c b/kernel/profile.c
index 6f69a4195d56..8a77769bc4b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -41,7 +41,8 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
@@ -58,43 +59,39 @@ int profile_setup(char *str)
static const char schedstr[] = "schedule";
static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
+ const char *select = NULL;
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
- if (str[strlen(sleepstr)] == ',')
- str += strlen(sleepstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = sleepstr;
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- if (str[strlen(schedstr)] == ',')
- str += strlen(schedstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
- if (str[strlen(kvmstr)] == ',')
- str += strlen(kvmstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = kvmstr;
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
+
+ if (select) {
+ if (str[strlen(select)] == ',')
+ str += strlen(select) + 1;
+ if (get_option(&str, &par))
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel %s profiling enabled (shift: %u)\n",
+ select, prof_shift);
+ }
+
return 1;
}
__setup("profile=", profile_setup);
@@ -108,6 +105,13 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
+
+ if (!prof_len) {
+ pr_warn("profiling shift: %u too large\n", prof_shift);
+ prof_on = 0;
+ return -EINVAL;
+ }
+
buffer_bytes = prof_len*sizeof(atomic_t);
if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
@@ -132,79 +136,6 @@ int __ref profile_init(void)
return -ENOMEM;
}
-/* Profile event notifications */
-
-static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
-static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
-static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
-
-void profile_task_exit(struct task_struct *task)
-{
- blocking_notifier_call_chain(&task_exit_notifier, 0, task);
-}
-
-int profile_handoff_task(struct task_struct *task)
-{
- int ret;
- ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
- return (ret == NOTIFY_OK) ? 1 : 0;
-}
-
-void profile_munmap(unsigned long addr)
-{
- blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-}
-
-int task_handoff_register(struct notifier_block *n)
-{
- return atomic_notifier_chain_register(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_register);
-
-int task_handoff_unregister(struct notifier_block *n)
-{
- return atomic_notifier_chain_unregister(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-
-int profile_event_register(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_register(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_register(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_register);
-
-int profile_event_unregister(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_unregister(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_unregister(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_unregister);
-
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -430,7 +361,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
@@ -468,7 +399,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
@@ -490,6 +421,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return read;
}
+/* default is to not implement this call */
+int __weak setup_profiling_timer(unsigned mult)
+{
+ return -EINVAL;
+}
+
/*
* Writing to /proc/profile resets the counters
*
@@ -500,8 +437,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
- extern int setup_profiling_timer(unsigned int multiplier);
-
if (count == sizeof(int)) {
unsigned int multiplier;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..2fabd497d659 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -31,6 +31,8 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -57,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
return 0;
}
- ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
@@ -117,9 +119,9 @@ void __ptrace_unlink(struct task_struct *child)
const struct cred *old_cred;
BUG_ON(!child->ptrace);
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(child, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(child, SYSCALL_EMU);
#endif
child->parent = child->real_parent;
@@ -143,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child)
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)) {
+ child->signal->group_stop_count))
child->jobctl |= JOBCTL_STOP_PENDING;
- /*
- * This is only possible if this thread was cloned by the
- * traced task running in the stopped group, set the signal
- * for the future reports.
- * FIXME: we should change ptrace_init_task() to handle this
- * case.
- */
- if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
- child->jobctl |= SIGSTOP;
- }
-
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
* @child in the butt. Note that @resume should be used iff @child
@@ -169,7 +160,27 @@ void __ptrace_unlink(struct task_struct *child)
spin_unlock(&child->sighand->siglock);
}
-/* Ensure that nothing can wake it up, even SIGKILL */
+static bool looks_like_a_spurious_pid(struct task_struct *task)
+{
+ if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
+ return false;
+
+ if (task_pid_vnr(task) == task->ptrace_message)
+ return false;
+ /*
+ * The tracee changed its pid but the PTRACE_EVENT_EXEC event
+ * was not wait()'ed, most probably debugger targets the old
+ * leader which was destroyed in de_thread().
+ */
+ return true;
+}
+
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -179,8 +190,9 @@ static bool ptrace_freeze_traced(struct task_struct *task)
return ret;
spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !__fatal_signal_pending(task)) {
- task->state = __TASK_TRACED;
+ if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ !__fatal_signal_pending(task)) {
+ task->jobctl |= JOBCTL_PTRACE_FROZEN;
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -190,23 +202,21 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (task->state != __TASK_TRACED)
- return;
-
- WARN_ON(!task->ptrace || task->parent != current);
+ unsigned long flags;
/*
- * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
- * Recheck state under the lock to close this race.
+ * The child may be awake and may have cleared
+ * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
+ * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
*/
- spin_lock_irq(&task->sighand->siglock);
- if (task->state == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
+ if (lock_task_sighand(task, &flags)) {
+ task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~JOBCTL_TRACED;
wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
+ }
+ unlock_task_sighand(task, &flags);
}
- spin_unlock_irq(&task->sighand->siglock);
}
/**
@@ -239,7 +249,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -249,32 +258,18 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
- /*
- * This can only happen if may_ptrace_stop() fails and
- * ptrace_stop() changes ->state back to TASK_RUNNING,
- * so we should not worry about leaking __TASK_TRACED.
- */
- WARN_ON(child->state == __TASK_TRACED);
- ret = -ESRCH;
- }
- }
+ if (!ret && !ignore_state &&
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
+ ret = -ESRCH;
return ret;
}
-static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
- unsigned int mode)
+static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
- int ret;
-
if (mode & PTRACE_MODE_NOAUDIT)
- ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
- else
- ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
-
- return ret == 0;
+ return ns_capable_noaudit(ns, CAP_SYS_PTRACE);
+ return ns_capable(ns, CAP_SYS_PTRACE);
}
/* Returns 0 on success, -errno on denial. */
@@ -326,7 +321,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(cred, tcred->user_ns, mode))
+ if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -345,7 +340,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(cred, mm->user_ns, mode)))
+ !ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -360,6 +355,54 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
+static inline void ptrace_set_stopped(struct task_struct *task)
+{
+ guard(spinlock)(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
+ signal_wake_up_state(task, __TASK_STOPPED);
+ }
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -367,12 +410,20 @@ static int ptrace_attach(struct task_struct *task, long request,
bool seize = (request == PTRACE_SEIZE);
int retval;
- retval = -EIO;
if (seize) {
if (addr != 0)
- goto out;
+ return -EIO;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
- goto out;
+ return -EIO;
+
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -380,88 +431,54 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
- retval = -EPERM;
if (unlikely(task->flags & PF_KTHREAD))
- goto out;
+ return -EPERM;
if (same_thread_group(task, current))
- goto out;
+ return -EPERM;
/*
* Protect exec's credential calculations against our interference;
* SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
- goto out;
+ scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+ &task->signal->cred_guard_mutex) {
- task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- task_unlock(task);
- if (retval)
- goto unlock_creds;
+ scoped_guard (task_lock, task) {
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (retval)
+ return retval;
+ }
- write_lock_irq(&tasklist_lock);
- retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
+ scoped_guard (write_lock_irq, &tasklist_lock) {
+ if (unlikely(task->exit_state))
+ return -EPERM;
+ if (task->ptrace)
+ return -EPERM;
- if (seize)
- flags |= PT_SEIZED;
- task->ptrace = flags;
+ task->ptrace = flags;
- ptrace_link(task, current);
+ ptrace_link(task, current);
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
- spin_lock(&task->sighand->siglock);
+ ptrace_set_stopped(task);
+ }
+ }
/*
- * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
- * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
- * will be cleared if the child completes the transition or any
- * event which clears the group stop states happens. We'll wait
- * for the transition to complete before returning from this
- * function.
- *
- * This hides STOPPED -> RUNNING -> TRACED transition from the
- * attaching thread but a different thread in the same group can
- * still observe the transient RUNNING state. IOW, if another
- * thread's WNOHANG wait(2) on the stopped tracee races against
- * ATTACH, the wait(2) may fail due to the transient RUNNING.
- *
- * The following task_is_stopped() test is safe as both transitions
- * in and out of STOPPED are protected by siglock.
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
*/
- if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
- signal_wake_up_state(task, __TASK_STOPPED);
-
- spin_unlock(&task->sighand->siglock);
-
- retval = 0;
-unlock_tasklist:
- write_unlock_irq(&tasklist_lock);
-unlock_creds:
- mutex_unlock(&task->signal->cred_guard_mutex);
-out:
- if (!retval) {
- /*
- * We do not bother to change retval or clear JOBCTL_TRAPPING
- * if wait_on_bit() was interrupted by SIGKILL. The tracer will
- * not return to user-mode, it will exit and clear this bit in
- * __ptrace_unlink() if it wasn't already cleared by the tracee;
- * and until then nobody can ptrace this task.
- */
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
- proc_ptrace_connector(task, PTRACE_ATTACH);
- }
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
- return retval;
+ return 0;
}
/**
@@ -645,22 +662,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
-
- if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
- !IS_ENABLED(CONFIG_SECCOMP))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
- current->ptrace & PT_SUSPEND_SECCOMP)
- return -EPERM;
- }
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
@@ -785,12 +791,26 @@ static int ptrace_peek_siginfo(struct task_struct *child,
return ret;
}
-#ifdef PTRACE_SINGLESTEP
-#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
-#else
-#define is_singlestep(request) 0
+#ifdef CONFIG_RSEQ
+static long ptrace_get_rseq_configuration(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ struct ptrace_rseq_configuration conf = {
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
+ .rseq_abi_size = task->rseq_len,
+ .signature = task->rseq_sig,
+ .flags = 0,
+ };
+
+ size = min_t(unsigned long, size, sizeof(conf));
+ if (copy_to_user(data, &conf, size))
+ return -EFAULT;
+ return sizeof(conf);
+}
#endif
+#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
+
#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
#else
@@ -806,21 +826,19 @@ static int ptrace_peek_siginfo(struct task_struct *child,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
- bool need_siglock;
-
if (!valid_signal(data))
return -EIO;
if (request == PTRACE_SYSCALL)
- set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ set_task_syscall_work(child, SYSCALL_TRACE);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ clear_task_syscall_work(child, SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
- set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ set_task_syscall_work(child, SYSCALL_EMU);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(child, SYSCALL_EMU);
#endif
if (is_singleblock(request)) {
@@ -843,18 +861,12 @@ static int ptrace_resume(struct task_struct *child, long request,
* Note that we need siglock even if ->exit_code == data and/or this
* status was not reported yet, the new status must not be cleared by
* wait_task_stopped() after resume.
- *
- * If data == 0 we do not care if wait_task_stopped() reports the old
- * status and clears the code too; this can't race with the tracee, it
- * takes siglock after resume.
*/
- need_siglock = data && !thread_group_empty(current);
- if (need_siglock)
- spin_lock_irq(&child->sighand->siglock);
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
- if (need_siglock)
- spin_unlock_irq(&child->sighand->siglock);
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
@@ -1177,9 +1189,7 @@ int ptrace_request(struct task_struct *child, long request,
}
#endif
-#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
-#endif
#ifdef PTRACE_SINGLEBLOCK
case PTRACE_SINGLEBLOCK:
#endif
@@ -1192,9 +1202,8 @@ int ptrace_request(struct task_struct *child, long request,
return ptrace_resume(child, request, data);
case PTRACE_KILL:
- if (child->exit_state) /* already dead */
- return 0;
- return ptrace_resume(child, request, SIGKILL);
+ send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
+ return 0;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
case PTRACE_GETREGSET:
@@ -1228,6 +1237,20 @@ int ptrace_request(struct task_struct *child, long request,
ret = seccomp_get_metadata(child, addr, datavp);
break;
+#ifdef CONFIG_RSEQ
+ case PTRACE_GET_RSEQ_CONFIGURATION:
+ ret = ptrace_get_rseq_configuration(child, addr, datavp);
+ break;
+#endif
+
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
@@ -1235,10 +1258,6 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-#ifndef arch_ptrace_attach
-#define arch_ptrace_attach(child) do { } while (0)
-#endif
-
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
@@ -1247,8 +1266,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
- if (!ret)
- arch_ptrace_attach(current);
goto out;
}
@@ -1260,12 +1277,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
@@ -1405,12 +1416,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
diff --git a/kernel/range.c b/kernel/range.c
index d84de6766472..56435f96da73 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -2,8 +2,9 @@
/*
* Range add and subtract
*/
-#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/minmax.h>
+#include <linux/printk.h>
#include <linux/sort.h>
#include <linux/string.h>
#include <linux/range.h>
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 0ebe15a84985..bdd7eadb33d8 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -8,6 +8,8 @@ menu "RCU Subsystem"
config TREE_RCU
bool
default y if SMP
+ # Dynticks-idle tracking
+ select CONTEXT_TRACKING_IDLE
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -51,56 +53,77 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- bool
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
-
config TINY_SRCU
bool
- default y if SRCU && TINY_RCU
+ default y if TINY_RCU
help
This option selects the single-CPU non-preemptible version of SRCU.
config TREE_SRCU
bool
- default y if SRCU && !TINY_RCU
+ default y if !TINY_RCU
help
This option selects the full-fledged version of SRCU.
+config NEED_SRCU_NMI_SAFE
+ def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
+
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
- select SRCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
+config FORCE_TASKS_RCU
+ bool "Force selection of TASKS_RCU"
+ depends on RCU_EXPERT
+ select TASKS_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only voluntary context switch (not preemption!),
+ idle, and user-mode execution as quiescent states. Not for
+ manual selection in most cases.
+
config TASKS_RCU
- def_bool PREEMPTION
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_RUDE_RCU
+ bool "Force selection of Tasks Rude RCU"
+ depends on RCU_EXPERT
+ select TASKS_RUDE_RCU
+ default n
help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states. Not for manual selection.
+ This option force-enables a task-based RCU implementation
+ that uses only context switch (including preemption) and
+ user-mode execution as quiescent states. It forces IPIs and
+ context switches on all online CPUs, including idle ones,
+ so use with caution. Not for manual selection in most cases.
config TASKS_RUDE_RCU
- def_bool 0
- help
- This option enables a task-based RCU implementation that uses
- only context switch (including preemption) and user-mode
- execution as quiescent states. It forces IPIs and context
- switches on all online CPUs, including idle ones, so use
- with caution.
+ bool
+ default n
+ select IRQ_WORK
-config TASKS_TRACE_RCU
- def_bool 0
+config FORCE_TASKS_TRACE_RCU
+ bool "Force selection of Tasks Trace RCU"
+ depends on RCU_EXPERT
+ select TASKS_TRACE_RCU
+ default n
help
This option enables a task-based RCU implementation that uses
explicit rcu_read_lock_trace() read-side markers, and allows
- these readers to appear in the idle loop as well as on the CPU
- hotplug code paths. It can force IPIs on online CPUs, including
- idle ones, so use with caution.
+ these readers to appear in the idle loop as well as on the
+ CPU hotplug code paths. It can force IPIs on online CPUs,
+ including idle ones, so use with caution. Not for manual
+ selection in most cases.
+
+config TASKS_TRACE_RCU
+ bool
+ default n
+ select IRQ_WORK
config RCU_STALL_COMMON
def_bool TREE_RCU
@@ -111,7 +134,7 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
@@ -135,10 +158,12 @@ config RCU_FANOUT
config RCU_FANOUT_LEAF
int "Tree-based hierarchical RCU leaf-level fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
+ range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD
+ range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD
+ range 2 3 if RCU_STRICT_GRACE_PERIOD
depends on TREE_RCU && RCU_EXPERT
- default 16
+ default 16 if !RCU_STRICT_GRACE_PERIOD
+ default 2 if RCU_STRICT_GRACE_PERIOD
help
This option controls the leaf-level fanout of hierarchical
implementations of RCU, and allows trading off cache misses
@@ -166,28 +191,10 @@ config RCU_FANOUT_LEAF
Take the default if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
@@ -210,6 +217,20 @@ config RCU_BOOST_DELAY
Accept the default if unsure.
+config RCU_EXP_KTHREAD
+ bool "Perform RCU expedited work in a real-time kthread"
+ depends on RCU_BOOST && RCU_EXPERT
+ default !PREEMPT_RT && NR_CPUS <= 32
+ help
+ Use this option to further reduce the latencies of expedited
+ grace periods at the expense of being more disruptive.
+
+ This option is disabled by default on PREEMPT_RT=y kernels which
+ disable expedited grace periods after boot by unconditionally
+ setting rcupdate.rcu_normal_after_boot=1.
+
+ Accept the default if unsure.
+
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU
@@ -219,24 +240,57 @@ config RCU_NOCB_CPU
Use this option to reduce OS jitter for aggressive HPC or
real-time workloads. It can also be used to offload RCU
callback invocation to energy-efficient CPUs in battery-powered
- asymmetric multiprocessors.
+ asymmetric multiprocessors. The price of this reduced jitter
+ is that the overhead of call_rcu() increases and that some
+ workloads will incur significant increases in context-switch
+ rates.
This option offloads callback invocation from the set of CPUs
specified at boot time by the rcu_nocbs parameter. For each
such CPU, a kthread ("rcuox/N") will be created to invoke
callbacks, where the "N" is the CPU being offloaded, and where
- the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
- (!PREEMPTION kernels). Nothing prevents this kthread from running
- on the specified CPUs, but (1) the kthreads may be preempted
- between each callback, and (2) affinity or cgroups can be used
- to force the kthreads to run on whatever set of CPUs is desired.
+ the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for
+ RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread
+ from running on the specified CPUs, but (1) the kthreads may be
+ preempted between each callback, and (2) affinity or cgroups can
+ be used to force the kthreads to run on whatever set of CPUs is
+ desired.
+
+ Say Y here if you need reduced OS jitter, despite added overhead.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
- Say Y here if you want to help to debug reduced OS jitter.
+ Say Y here if you want offload all CPUs by default on boot.
Say N here if you are unsure.
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Offload RCU callback from real-time kthread"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
default PREEMPT_RT || NR_CPUS < 8
help
Use this option to further reduce the number of IPIs sent
@@ -252,4 +306,30 @@ config TASKS_TRACE_RCU_READ_MB
Say N here if you hate read-side memory barriers.
Take the default if you are unsure.
+config RCU_LAZY
+ bool "RCU callback lazy invocation functionality"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ To save power, batch RCU callbacks and flush after delay, memory
+ pressure, or callback list growing too big.
+
+config RCU_DOUBLE_CHECK_CB_TIME
+ bool "RCU callback-batch backup time check"
+ depends on RCU_EXPERT
+ default n
+ help
+ Use this option to provide more precise enforcement of the
+ rcutree.rcu_resched_ns module parameter in situations where
+ a single RCU callback might run for hundreds of microseconds,
+ thus defeating the 32-callback batching used to amortize the
+ cost of the fine-grained but expensive local_clock() function.
+
+ This option rounds rcutree.rcu_resched_ns up to the next
+ jiffy, and overrides the 32-callback batching if this limit
+ is exceeded.
+
+ Say Y here if you need tighter callback-limit enforcement.
+ Say N here if you are unsure.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 452feae8de20..9b0b52e1836f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -23,14 +23,10 @@ config TORTURE_TEST
tristate
default n
-config RCU_PERF_TEST
+config RCU_SCALE_TEST
tristate "performance tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance
@@ -46,10 +42,6 @@ config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs torture tests
@@ -61,6 +53,21 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
+config RCU_REF_SCALE_TEST
+ tristate "Scalability tests for read-side synchronization (RCU and others)"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ default n
+ help
+ This option provides a kernel module that runs performance tests
+ useful comparing RCU with various read-side synchronization mechanisms.
+ The kernel module may be built after the fact on the running kernel to be
+ tested, if desired.
+
+ Say Y here if you want these performance tests built into the kernel.
+ Say M if you want to build it as a module instead.
+ Say N if you are unsure.
+
config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
depends on RCU_STALL_COMMON
@@ -72,6 +79,57 @@ config RCU_CPU_STALL_TIMEOUT
RCU grace period persists, additional CPU stall warnings are
printed at more widely spaced intervals.
+config RCU_EXP_CPU_STALL_TIMEOUT
+ int "Expedited RCU CPU stall timeout in milliseconds"
+ depends on RCU_STALL_COMMON
+ range 0 300000
+ default 0
+ help
+ If a given expedited RCU grace period extends more than the
+ specified number of milliseconds, a CPU stall warning is printed.
+ If the RCU grace period persists, additional CPU stall warnings
+ are printed at more widely spaced intervals. A value of zero
+ says to use the RCU_CPU_STALL_TIMEOUT value converted from
+ seconds to milliseconds.
+
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ Collect statistics during the sampling period, such as the number of
+ (hard interrupts, soft interrupts, task switches) and the cputime of
+ (hard interrupts, soft interrupts, kernel tasks) are added to the
+ RCU stall report. For multiple continuous RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this one, but will override this if it exists.
+
+config RCU_CPU_STALL_NOTIFIER
+ bool "Provide RCU CPU-stall notifiers"
+ depends on RCU_STALL_COMMON
+ depends on DEBUG_KERNEL
+ depends on RCU_EXPERT
+ default n
+ help
+ WARNING: You almost certainly do not want this!!!
+
+ Enable RCU CPU-stall notifiers, which are invoked just before
+ printing the RCU CPU stall warning. As such, bugs in notifier
+ callbacks can prevent stall warnings from being printed.
+ And the whole reason that a stall warning is being printed is
+ that something is hung up somewhere. Therefore, the notifier
+ callbacks must be written extremely carefully, preferably
+ containing only lockless code. After all, it is quite possible
+ that the whole reason that the RCU CPU stall is happening in
+ the first place is that someone forgot to release whatever lock
+ that you are thinking of acquiring. In which case, having your
+ notifier callback acquire that lock will hang, preventing the
+ RCU CPU stall warning from appearing.
+
+ Say Y here if you want RCU CPU stall notifiers (you don't want them)
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
@@ -95,4 +153,19 @@ config RCU_EQS_DEBUG
Say N here if you need ultimate kernel/user switch latencies
Say Y if you are unsure
+config RCU_STRICT_GRACE_PERIOD
+ bool "Provide debug RCU implementation with short grace periods"
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
+ default n
+ select PREEMPT_COUNT if PREEMPT=n
+ help
+ Select this option to build an RCU variant that is strict about
+ grace periods, making them as short as it can. This limits
+ scalability, destroys real-time response, degrades battery
+ lifetime and kills performance. Don't try this on large
+ machines, as in systems with more than about 10 or 20 CPUs.
+ But in conjunction with tools like KASAN, it can be helpful
+ when looking for certain types of RCU usage bugs, for example,
+ too-short RCU read-side critical sections.
+
endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index f91f2c2cf138..0cfb009a99b9 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -11,7 +11,8 @@ obj-y += update.o sync.o
obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
+obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index cf66a3ccd757..f94f65877f2b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,19 +10,58 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <linux/slab.h>
#include <trace/events/rcu.h>
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
-
-
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
+extern int sysctl_sched_rt_runtime;
+
/*
* Return the counter portion of a sequence number previously returned
* by rcu_seq_snap() or rcu_seq_current().
@@ -118,6 +157,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
}
/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+}
+
+/*
* Has a grace period completed since the time the old gp_seq was collected?
*/
static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
@@ -167,7 +218,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old)
# define STATE_RCU_HEAD_READY 0
# define STATE_RCU_HEAD_QUEUED 1
-extern struct debug_obj_descr rcuhead_debug_descr;
+extern const struct debug_obj_descr rcuhead_debug_descr;
static inline int debug_rcu_head_queue(struct rcu_head *head)
{
@@ -198,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+ if (unlikely(!rhp->func))
+ kmem_dump_obj(rhp);
+}
+
extern int rcu_cpu_stall_suppress_at_boot;
static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -205,12 +262,18 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
}
+extern int rcu_cpu_stall_notifiers;
+
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
static inline bool rcu_stall_is_suppressed(void)
{
@@ -271,7 +334,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
+#if !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -308,6 +371,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+extern void rcu_init_geometry(void);
+
/* Returns a pointer to the first leaf rcu_node structure. */
#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
@@ -322,11 +387,13 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
@@ -358,6 +425,10 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
+#endif /* !defined(CONFIG_TINY_RCU) */
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
/*
* Wrappers for the rcu_node::lock acquire and release.
*
@@ -378,7 +449,11 @@ do { \
smp_mb__after_unlock_lock(); \
} while (0)
-#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+#define raw_spin_unlock_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irq_rcu_node(p) \
do { \
@@ -387,7 +462,10 @@ do { \
} while (0)
#define raw_spin_unlock_irq_rcu_node(p) \
- raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irqsave_rcu_node(p, flags) \
do { \
@@ -396,7 +474,10 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \
+} while (0)
#define raw_spin_trylock_rcu_node(p) \
({ \
@@ -410,31 +491,43 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
-
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
+#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
+#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
-void rcu_request_urgent_qs_task(struct task_struct *t);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -449,6 +542,14 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
@@ -478,10 +579,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
static inline void rcu_gp_set_torture_wait(int duration) { }
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
#ifdef CONFIG_TINY_SRCU
static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -509,28 +606,65 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
-bool rcu_is_nocb_cpu(int cpu);
void rcu_bind_current_to_nocb(void);
#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
static inline void rcu_bind_current_to_nocb(void) { }
#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU)
+void show_rcu_tasks_classic_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_classic_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU)
+void show_rcu_tasks_rude_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_rude_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
+void show_rcu_tasks_trace_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_trace_gp_kthread(void) {}
+#endif
+
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
+#else
+bool rcu_cpu_beenfullyonline(int cpu);
+#endif
+
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9a0f66133b4b..1693ea22ef1b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -7,10 +7,10 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/cpu.h>
#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include "rcu_segcblist.h"
@@ -88,23 +88,135 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
#endif
}
+/* Get the length of a segment of the rcu_segcblist structure. */
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+ long len = 0;
+ int i;
+
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ len += rcu_segcblist_get_seglen(rsclp, i);
+
+ return len;
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+ long len;
+
+ if (from == to)
+ return;
+
+ len = rcu_segcblist_get_seglen(rsclp, from);
+ if (!len)
+ return;
+
+ rcu_segcblist_add_seglen(rsclp, to, len);
+ rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
/*
* Increase the numeric length of an rcu_segcblist structure by the
* specified amount, which can be negative. This can cause the ->len
* field to disagree with the actual number of callbacks on the structure.
* This increase is fully ordered with respect to the callers accesses
* both before and after.
+ *
+ * So why on earth is a memory barrier required both before and after
+ * the update to the ->len field???
+ *
+ * The reason is that rcu_barrier() locklessly samples each CPU's ->len
+ * field, and if a given CPU's field is zero, avoids IPIing that CPU.
+ * This can of course race with both queuing and invoking of callbacks.
+ * Failing to correctly handle either of these races could result in
+ * rcu_barrier() failing to IPI a CPU that actually had callbacks queued
+ * which rcu_barrier() was obligated to wait on. And if rcu_barrier()
+ * failed to wait on such a callback, unloading certain kernel modules
+ * would result in calls to functions whose code was no longer present in
+ * the kernel, for but one example.
+ *
+ * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully
+ * ordered with respect with both list modifications and the rcu_barrier().
+ *
+ * The queuing case is CASE 1 and the invoking case is CASE 2.
+ *
+ * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes
+ * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field
+ * will transition from 0->1, which is one of the transitions that must
+ * be handled carefully. Without the full memory barriers after the ->len
+ * update and at the beginning of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * call_rcu().
+ * rcu_barrier() sees ->len as 0.
+ * set ->len = 1.
+ * rcu_barrier() does nothing.
+ * module is unloaded.
+ * callback invokes unloaded function!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0 will
+ * have unambiguously preceded the return from the racing call_rcu(), which
+ * means that this call_rcu() invocation is OK to not wait on. After all,
+ * you are supposed to make sure that any problematic call_rcu() invocations
+ * happen before the rcu_barrier().
+ *
+ *
+ * CASE 2: Suppose that CPU 0 is invoking its last callback just as
+ * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from
+ * 1->0, which is one of the transitions that must be handled carefully.
+ * Without the full memory barriers before the ->len update and at the
+ * end of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * start invoking last callback
+ * set ->len = 0 (reordered)
+ * rcu_barrier() sees ->len as 0
+ * rcu_barrier() does nothing.
+ * module is unloaded
+ * callback executing after unloaded!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0
+ * will be fully ordered after the completion of the callback function,
+ * so that the module unloading operation is completely safe.
+ *
*/
-static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
- smp_mb__before_atomic(); /* Up to the caller! */
+ smp_mb__before_atomic(); // Read header comment above.
atomic_long_add(v, &rsclp->len);
- smp_mb__after_atomic(); /* Up to the caller! */
+ smp_mb__after_atomic(); // Read header comment above.
#else
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
WRITE_ONCE(rsclp->len, rsclp->len + v);
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
#endif
}
@@ -120,26 +232,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
}
/*
- * Exchange the numeric length of the specified rcu_segcblist structure
- * with the specified value. This can cause the ->len field to disagree
- * with the actual number of callbacks on the structure. This exchange is
- * fully ordered with respect to the callers accesses both before and after.
- */
-static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- return atomic_long_xchg(&rsclp->len, v);
-#else
- long ret = rsclp->len;
-
- smp_mb(); /* Up to the caller! */
- WRITE_ONCE(rsclp->len, v);
- smp_mb(); /* Up to the caller! */
- return ret;
-#endif
-}
-
-/*
* Initialize an rcu_segcblist structure.
*/
void rcu_segcblist_init(struct rcu_segcblist *rsclp)
@@ -149,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
rsclp->head = NULL;
- for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
rsclp->tails[i] = &rsclp->head;
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
rcu_segcblist_set_len(rsclp, 0);
- rsclp->enabled = 1;
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
@@ -163,16 +257,18 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- rsclp->enabled = 0;
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
- * Mark the specified rcu_segcblist structure as offloaded. This
- * structure must be empty.
+ * Mark the specified rcu_segcblist structure as offloaded (or not)
*/
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- rsclp->offloaded = 1;
+ if (offload)
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
+ else
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
}
/*
@@ -245,7 +341,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
- smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
@@ -272,8 +368,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1])
+ if (!rcu_segcblist_segempty(rsclp, i))
break;
+ rcu_segcblist_inc_seglen(rsclp, i);
WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
WRITE_ONCE(rsclp->tails[i], &rhp->next);
@@ -281,21 +378,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
}
/*
- * Extract only the counts from the specified rcu_segcblist structure,
- * and place them in the specified rcu_cblist structure. This function
- * supports both callback orphaning and invocation, hence the separation
- * of counts and callbacks. (Callbacks ready for invocation must be
- * orphaned and adopted separately from pending callbacks, but counts
- * apply to all callbacks. Locking must be used to make sure that
- * both orphaned-callbacks lists are consistent.)
- */
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp)
-{
- rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
-}
-
-/*
* Extract only those callbacks ready to be invoked from the specified
* rcu_segcblist structure and place them in the specified rcu_cblist
* structure.
@@ -307,6 +389,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
@@ -314,6 +397,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+ rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}
/*
@@ -330,11 +414,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_pend_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = 0;
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
- for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+ rclp->len += rcu_segcblist_get_seglen(rsclp, i);
WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
}
/*
@@ -345,7 +433,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rcu_segcblist_add_len(rsclp, rclp->len);
- rclp->len = 0;
}
/*
@@ -359,6 +446,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
+ rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
@@ -379,6 +467,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
+
+ rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
}
@@ -403,6 +493,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
}
/* If no callbacks moved, nothing more need be done. */
@@ -414,15 +505,16 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
- * Callbacks moved, so clean up the misordered ->tails[] pointers
- * that now point into the middle of the list of ready-to-invoke
- * callbacks. The overall effect is to copy down the later pointers
- * into the gap that was created by the now-ready segments.
+ * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+ * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the
+ * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+ * created by the now-ready-to-invoke segments.
*/
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, j);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -444,7 +536,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
*/
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
{
- int i;
+ int i, j;
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -459,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* as their ->gp_seq[] grace-period completion sequence number.
*/
for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+ if (!rcu_segcblist_segempty(rsclp, i) &&
ULONG_CMP_LT(rsclp->gp_seq[i], seq))
break;
@@ -475,10 +567,22 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* Also advance to the oldest segment of callbacks whose
* ->gp_seq[] completion is at or after that passed in via "seq",
* skipping any empty segments.
+ *
+ * Note that segment "i" (and any lower-numbered segments
+ * containing older callbacks) will be unaffected, and their
+ * grace-period numbers remain unchanged. For example, if i ==
+ * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched.
+ * Instead, the CBs in NEXT_TAIL will be merged with those in
+ * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL
+ * would be updated. NEXT_TAIL would then be empty.
*/
- if (++i >= RCU_NEXT_TAIL)
+ if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
return false;
+ /* Accounting: everything below i is about to get merged into i. */
+ for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+ rcu_segcblist_move_seglen(rsclp, j, i);
+
/*
* Merge all later callbacks, including newly arrived callbacks,
* into the segment located by the for-loop above. Assign "seq"
@@ -506,13 +610,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_cblist donecbs;
struct rcu_cblist pendcbs;
+ lockdep_assert_cpus_held();
+
rcu_cblist_init(&donecbs);
rcu_cblist_init(&pendcbs);
- rcu_segcblist_extract_count(src_rsclp, &donecbs);
+
rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+ /*
+ * No need smp_mb() before setting length to 0, because CPU hotplug
+ * lock excludes rcu_barrier.
+ */
+ rcu_segcblist_set_len(src_rsclp, 0);
+
rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
rcu_segcblist_init(src_rsclp);
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 5c293afc07b8..4fe877f5f654 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,11 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -50,19 +55,53 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ return READ_ONCE(rsclp->flags) & flags;
+}
+
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return rsclp->enabled;
+ return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded? */
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
- return rsclp->offloaded;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
+ return true;
+
+ return false;
+}
+
+static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
+{
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
+ return true;
+
+ return false;
}
/*
@@ -75,10 +114,22 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
@@ -88,8 +139,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
deleted file mode 100644
index 9eb39c20082c..000000000000
--- a/kernel/rcu/rcuperf.c
+++ /dev/null
@@ -1,856 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Read-Copy Update module-based performance-test facility
- *
- * Copyright (C) IBM Corporation, 2015
- *
- * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
- */
-
-#define pr_fmt(fmt) fmt
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/err.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/reboot.h>
-#include <linux/freezer.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/stat.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <asm/byteorder.h>
-#include <linux/torture.h>
-#include <linux/vmalloc.h>
-
-#include "rcu.h"
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
-
-#define PERF_FLAG "-perf:"
-#define PERFOUT_STRING(s) \
- pr_alert("%s" PERF_FLAG " %s\n", perf_type, s)
-#define VERBOSE_PERFOUT_STRING(s) \
- do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
-#define VERBOSE_PERFOUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
-
-/*
- * The intended use cases for the nreaders and nwriters module parameters
- * are as follows:
- *
- * 1. Specify only the nr_cpus kernel boot parameter. This will
- * set both nreaders and nwriters to the value specified by
- * nr_cpus for a mixed reader/writer test.
- *
- * 2. Specify the nr_cpus kernel boot parameter, but set
- * rcuperf.nreaders to zero. This will set nwriters to the
- * value specified by nr_cpus for an update-only test.
- *
- * 3. Specify the nr_cpus kernel boot parameter, but set
- * rcuperf.nwriters to zero. This will set nreaders to the
- * value specified by nr_cpus for a read-only test.
- *
- * Various other use cases may of course be specified.
- */
-
-#ifdef MODULE
-# define RCUPERF_SHUTDOWN 0
-#else
-# define RCUPERF_SHUTDOWN 1
-#endif
-
-torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
-torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
-torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
- "Shutdown at end of performance tests.");
-torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
-torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
-torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
-torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
-
-static char *perf_type = "rcu";
-module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");
-
-static int nrealreaders;
-static int nrealwriters;
-static struct task_struct **writer_tasks;
-static struct task_struct **reader_tasks;
-static struct task_struct *shutdown_task;
-
-static u64 **writer_durations;
-static int *writer_n_durations;
-static atomic_t n_rcu_perf_reader_started;
-static atomic_t n_rcu_perf_writer_started;
-static atomic_t n_rcu_perf_writer_finished;
-static wait_queue_head_t shutdown_wq;
-static u64 t_rcu_perf_writer_started;
-static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_gp_test_started;
-static unsigned long b_rcu_gp_test_finished;
-static DEFINE_PER_CPU(atomic_t, n_async_inflight);
-
-#define MAX_MEAS 10000
-#define MIN_MEAS 100
-
-/*
- * Operations vector for selecting different types of tests.
- */
-
-struct rcu_perf_ops {
- int ptype;
- void (*init)(void);
- void (*cleanup)(void);
- int (*readlock)(void);
- void (*readunlock)(int idx);
- unsigned long (*get_gp_seq)(void);
- unsigned long (*gp_diff)(unsigned long new, unsigned long old);
- unsigned long (*exp_completed)(void);
- void (*async)(struct rcu_head *head, rcu_callback_t func);
- void (*gp_barrier)(void);
- void (*sync)(void);
- void (*exp_sync)(void);
- const char *name;
-};
-
-static struct rcu_perf_ops *cur_ops;
-
-/*
- * Definitions for rcu perf testing.
- */
-
-static int rcu_perf_read_lock(void) __acquires(RCU)
-{
- rcu_read_lock();
- return 0;
-}
-
-static void rcu_perf_read_unlock(int idx) __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static unsigned long __maybe_unused rcu_no_completed(void)
-{
- return 0;
-}
-
-static void rcu_sync_perf_init(void)
-{
-}
-
-static struct rcu_perf_ops rcu_ops = {
- .ptype = RCU_FLAVOR,
- .init = rcu_sync_perf_init,
- .readlock = rcu_perf_read_lock,
- .readunlock = rcu_perf_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .exp_completed = rcu_exp_batches_completed,
- .async = call_rcu,
- .gp_barrier = rcu_barrier,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .name = "rcu"
-};
-
-/*
- * Definitions for srcu perf testing.
- */
-
-DEFINE_STATIC_SRCU(srcu_ctl_perf);
-static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf;
-
-static int srcu_perf_read_lock(void) __acquires(srcu_ctlp)
-{
- return srcu_read_lock(srcu_ctlp);
-}
-
-static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp)
-{
- srcu_read_unlock(srcu_ctlp, idx);
-}
-
-static unsigned long srcu_perf_completed(void)
-{
- return srcu_batches_completed(srcu_ctlp);
-}
-
-static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
-{
- call_srcu(srcu_ctlp, head, func);
-}
-
-static void srcu_rcu_barrier(void)
-{
- srcu_barrier(srcu_ctlp);
-}
-
-static void srcu_perf_synchronize(void)
-{
- synchronize_srcu(srcu_ctlp);
-}
-
-static void srcu_perf_synchronize_expedited(void)
-{
- synchronize_srcu_expedited(srcu_ctlp);
-}
-
-static struct rcu_perf_ops srcu_ops = {
- .ptype = SRCU_FLAVOR,
- .init = rcu_sync_perf_init,
- .readlock = srcu_perf_read_lock,
- .readunlock = srcu_perf_read_unlock,
- .get_gp_seq = srcu_perf_completed,
- .gp_diff = rcu_seq_diff,
- .exp_completed = srcu_perf_completed,
- .async = srcu_call_rcu,
- .gp_barrier = srcu_rcu_barrier,
- .sync = srcu_perf_synchronize,
- .exp_sync = srcu_perf_synchronize_expedited,
- .name = "srcu"
-};
-
-static struct srcu_struct srcud;
-
-static void srcu_sync_perf_init(void)
-{
- srcu_ctlp = &srcud;
- init_srcu_struct(srcu_ctlp);
-}
-
-static void srcu_sync_perf_cleanup(void)
-{
- cleanup_srcu_struct(srcu_ctlp);
-}
-
-static struct rcu_perf_ops srcud_ops = {
- .ptype = SRCU_FLAVOR,
- .init = srcu_sync_perf_init,
- .cleanup = srcu_sync_perf_cleanup,
- .readlock = srcu_perf_read_lock,
- .readunlock = srcu_perf_read_unlock,
- .get_gp_seq = srcu_perf_completed,
- .gp_diff = rcu_seq_diff,
- .exp_completed = srcu_perf_completed,
- .async = srcu_call_rcu,
- .gp_barrier = srcu_rcu_barrier,
- .sync = srcu_perf_synchronize,
- .exp_sync = srcu_perf_synchronize_expedited,
- .name = "srcud"
-};
-
-/*
- * Definitions for RCU-tasks perf testing.
- */
-
-static int tasks_perf_read_lock(void)
-{
- return 0;
-}
-
-static void tasks_perf_read_unlock(int idx)
-{
-}
-
-static struct rcu_perf_ops tasks_ops = {
- .ptype = RCU_TASKS_FLAVOR,
- .init = rcu_sync_perf_init,
- .readlock = tasks_perf_read_lock,
- .readunlock = tasks_perf_read_unlock,
- .get_gp_seq = rcu_no_completed,
- .gp_diff = rcu_seq_diff,
- .async = call_rcu_tasks,
- .gp_barrier = rcu_barrier_tasks,
- .sync = synchronize_rcu_tasks,
- .exp_sync = synchronize_rcu_tasks,
- .name = "tasks"
-};
-
-static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old)
-{
- if (!cur_ops->gp_diff)
- return new - old;
- return cur_ops->gp_diff(new, old);
-}
-
-/*
- * If performance tests complete, wait for shutdown to commence.
- */
-static void rcu_perf_wait_shutdown(void)
-{
- cond_resched_tasks_rcu_qs();
- if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
- return;
- while (!torture_must_stop())
- schedule_timeout_uninterruptible(1);
-}
-
-/*
- * RCU perf reader kthread. Repeatedly does empty RCU read-side
- * critical section, minimizing update-side interference.
- */
-static int
-rcu_perf_reader(void *arg)
-{
- unsigned long flags;
- int idx;
- long me = (long)arg;
-
- VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- set_user_nice(current, MAX_NICE);
- atomic_inc(&n_rcu_perf_reader_started);
-
- do {
- local_irq_save(flags);
- idx = cur_ops->readlock();
- cur_ops->readunlock(idx);
- local_irq_restore(flags);
- rcu_perf_wait_shutdown();
- } while (!torture_must_stop());
- torture_kthread_stopping("rcu_perf_reader");
- return 0;
-}
-
-/*
- * Callback function for asynchronous grace periods from rcu_perf_writer().
- */
-static void rcu_perf_async_cb(struct rcu_head *rhp)
-{
- atomic_dec(this_cpu_ptr(&n_async_inflight));
- kfree(rhp);
-}
-
-/*
- * RCU perf writer kthread. Repeatedly does a grace period.
- */
-static int
-rcu_perf_writer(void *arg)
-{
- int i = 0;
- int i_max;
- long me = (long)arg;
- struct rcu_head *rhp = NULL;
- struct sched_param sp;
- bool started = false, done = false, alldone = false;
- u64 t;
- u64 *wdp;
- u64 *wdpp = writer_durations[me];
-
- VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
- WARN_ON(!wdpp);
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- sp.sched_priority = 1;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-
- if (holdoff)
- schedule_timeout_uninterruptible(holdoff * HZ);
-
- /*
- * Wait until rcu_end_inkernel_boot() is called for normal GP tests
- * so that RCU is not always expedited for normal GP tests.
- * The system_state test is approximate, but works well in practice.
- */
- while (!gp_exp && system_state != SYSTEM_RUNNING)
- schedule_timeout_uninterruptible(1);
-
- t = ktime_get_mono_fast_ns();
- if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
- t_rcu_perf_writer_started = t;
- if (gp_exp) {
- b_rcu_gp_test_started =
- cur_ops->exp_completed() / 2;
- } else {
- b_rcu_gp_test_started = cur_ops->get_gp_seq();
- }
- }
-
- do {
- if (writer_holdoff)
- udelay(writer_holdoff);
- wdp = &wdpp[i];
- *wdp = ktime_get_mono_fast_ns();
- if (gp_async) {
-retry:
- if (!rhp)
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
- atomic_inc(this_cpu_ptr(&n_async_inflight));
- cur_ops->async(rhp, rcu_perf_async_cb);
- rhp = NULL;
- } else if (!kthread_should_stop()) {
- cur_ops->gp_barrier();
- goto retry;
- } else {
- kfree(rhp); /* Because we are stopping. */
- }
- } else if (gp_exp) {
- cur_ops->exp_sync();
- } else {
- cur_ops->sync();
- }
- t = ktime_get_mono_fast_ns();
- *wdp = t - *wdp;
- i_max = i;
- if (!started &&
- atomic_read(&n_rcu_perf_writer_started) >= nrealwriters)
- started = true;
- if (!done && i >= MIN_MEAS) {
- done = true;
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current,
- SCHED_NORMAL, &sp);
- pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
- perf_type, PERF_FLAG, me, MIN_MEAS);
- if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
- nrealwriters) {
- schedule_timeout_interruptible(10);
- rcu_ftrace_dump(DUMP_ALL);
- PERFOUT_STRING("Test complete");
- t_rcu_perf_writer_finished = t;
- if (gp_exp) {
- b_rcu_gp_test_finished =
- cur_ops->exp_completed() / 2;
- } else {
- b_rcu_gp_test_finished =
- cur_ops->get_gp_seq();
- }
- if (shutdown) {
- smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
- }
- }
- }
- if (done && !alldone &&
- atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
- alldone = true;
- if (started && !alldone && i < MAX_MEAS - 1)
- i++;
- rcu_perf_wait_shutdown();
- } while (!torture_must_stop());
- if (gp_async) {
- cur_ops->gp_barrier();
- }
- writer_n_durations[me] = i_max;
- torture_kthread_stopping("rcu_perf_writer");
- return 0;
-}
-
-static void
-rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag)
-{
- pr_alert("%s" PERF_FLAG
- "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
- perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
-}
-
-static void
-rcu_perf_cleanup(void)
-{
- int i;
- int j;
- int ngps = 0;
- u64 *wdp;
- u64 *wdpp;
-
- /*
- * Would like warning at start, but everything is expedited
- * during the mid-boot phase, so have to wait till the end.
- */
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- if (rcu_gp_is_normal() && gp_exp)
- VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- if (gp_exp && gp_async)
- VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
-
- if (torture_cleanup_begin())
- return;
- if (!cur_ops) {
- torture_cleanup_end();
- return;
- }
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- torture_stop_kthread(rcu_perf_reader,
- reader_tasks[i]);
- kfree(reader_tasks);
- }
-
- if (writer_tasks) {
- for (i = 0; i < nrealwriters; i++) {
- torture_stop_kthread(rcu_perf_writer,
- writer_tasks[i]);
- if (!writer_n_durations)
- continue;
- j = writer_n_durations[i];
- pr_alert("%s%s writer %d gps: %d\n",
- perf_type, PERF_FLAG, i, j);
- ngps += j;
- }
- pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
- perf_type, PERF_FLAG,
- t_rcu_perf_writer_started, t_rcu_perf_writer_finished,
- t_rcu_perf_writer_finished -
- t_rcu_perf_writer_started,
- ngps,
- rcuperf_seq_diff(b_rcu_gp_test_finished,
- b_rcu_gp_test_started));
- for (i = 0; i < nrealwriters; i++) {
- if (!writer_durations)
- break;
- if (!writer_n_durations)
- continue;
- wdpp = writer_durations[i];
- if (!wdpp)
- continue;
- for (j = 0; j <= writer_n_durations[i]; j++) {
- wdp = &wdpp[j];
- pr_alert("%s%s %4d writer-duration: %5d %llu\n",
- perf_type, PERF_FLAG,
- i, j, *wdp);
- if (j % 100 == 0)
- schedule_timeout_uninterruptible(1);
- }
- kfree(writer_durations[i]);
- }
- kfree(writer_tasks);
- kfree(writer_durations);
- kfree(writer_n_durations);
- }
-
- /* Do torture-type-specific cleanup operations. */
- if (cur_ops->cleanup != NULL)
- cur_ops->cleanup();
-
- torture_cleanup_end();
-}
-
-/*
- * Return the number if non-negative. If -1, the number of CPUs.
- * If less than -1, that much less than the number of CPUs, but
- * at least one.
- */
-static int compute_real(int n)
-{
- int nr;
-
- if (n >= 0) {
- nr = n;
- } else {
- nr = num_online_cpus() + 1 + n;
- if (nr <= 0)
- nr = 1;
- }
- return nr;
-}
-
-/*
- * RCU perf shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_perf_shutdown(void *arg)
-{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_perf_writer_finished) >=
- nrealwriters);
- } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_perf_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-/*
- * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
- * of iterations and measure total time and number of GP for all iterations to complete.
- */
-
-torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
-torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
-torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
-
-static struct task_struct **kfree_reader_tasks;
-static int kfree_nrealthreads;
-static atomic_t n_kfree_perf_thread_started;
-static atomic_t n_kfree_perf_thread_ended;
-
-struct kfree_obj {
- char kfree_obj[8];
- struct rcu_head rh;
-};
-
-static int
-kfree_perf_thread(void *arg)
-{
- int i, loop = 0;
- long me = (long)arg;
- struct kfree_obj *alloc_ptr;
- u64 start_time, end_time;
- long long mem_begin, mem_during = 0;
-
- VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- set_user_nice(current, MAX_NICE);
-
- start_time = ktime_get_mono_fast_ns();
-
- if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
- if (gp_exp)
- b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
- else
- b_rcu_gp_test_started = cur_ops->get_gp_seq();
- }
-
- do {
- if (!mem_during) {
- mem_during = mem_begin = si_mem_available();
- } else if (loop % (kfree_loops / 4) == 0) {
- mem_during = (mem_during + si_mem_available()) / 2;
- }
-
- for (i = 0; i < kfree_alloc_num; i++) {
- alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
- if (!alloc_ptr)
- return -ENOMEM;
-
- kfree_rcu(alloc_ptr, rh);
- }
-
- cond_resched();
- } while (!torture_must_stop() && ++loop < kfree_loops);
-
- if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
- end_time = ktime_get_mono_fast_ns();
-
- if (gp_exp)
- b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
- else
- b_rcu_gp_test_finished = cur_ops->get_gp_seq();
-
- pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
- (unsigned long long)(end_time - start_time), kfree_loops,
- rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
- (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
-
- if (shutdown) {
- smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
- }
- }
-
- torture_kthread_stopping("kfree_perf_thread");
- return 0;
-}
-
-static void
-kfree_perf_cleanup(void)
-{
- int i;
-
- if (torture_cleanup_begin())
- return;
-
- if (kfree_reader_tasks) {
- for (i = 0; i < kfree_nrealthreads; i++)
- torture_stop_kthread(kfree_perf_thread,
- kfree_reader_tasks[i]);
- kfree(kfree_reader_tasks);
- }
-
- torture_cleanup_end();
-}
-
-/*
- * shutdown kthread. Just waits to be awakened, then shuts down system.
- */
-static int
-kfree_perf_shutdown(void *arg)
-{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_perf_thread_ended) >=
- kfree_nrealthreads);
- } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
-
- smp_mb(); /* Wake before output. */
-
- kfree_perf_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-static int __init
-kfree_perf_init(void)
-{
- long i;
- int firsterr = 0;
-
- kfree_nrealthreads = compute_real(kfree_nthreads);
- /* Start up the kthreads. */
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
- shutdown_task);
- if (firsterr)
- goto unwind;
- schedule_timeout_uninterruptible(1);
- }
-
- pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj));
-
- kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
- GFP_KERNEL);
- if (kfree_reader_tasks == NULL) {
- firsterr = -ENOMEM;
- goto unwind;
- }
-
- for (i = 0; i < kfree_nrealthreads; i++) {
- firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
- kfree_reader_tasks[i]);
- if (firsterr)
- goto unwind;
- }
-
- while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
- schedule_timeout_uninterruptible(1);
-
- torture_init_end();
- return 0;
-
-unwind:
- torture_init_end();
- kfree_perf_cleanup();
- return firsterr;
-}
-
-static int __init
-rcu_perf_init(void)
-{
- long i;
- int firsterr = 0;
- static struct rcu_perf_ops *perf_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops,
- };
-
- if (!torture_init_begin(perf_type, verbose))
- return -EBUSY;
-
- /* Process args and tell the world that the perf'er is on the job. */
- for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
- cur_ops = perf_ops[i];
- if (strcmp(perf_type, cur_ops->name) == 0)
- break;
- }
- if (i == ARRAY_SIZE(perf_ops)) {
- pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type);
- pr_alert("rcu-perf types:");
- for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
- pr_cont(" %s", perf_ops[i]->name);
- pr_cont("\n");
- WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
- firsterr = -EINVAL;
- cur_ops = NULL;
- goto unwind;
- }
- if (cur_ops->init)
- cur_ops->init();
-
- if (kfree_rcu_test)
- return kfree_perf_init();
-
- nrealwriters = compute_real(nwriters);
- nrealreaders = compute_real(nreaders);
- atomic_set(&n_rcu_perf_reader_started, 0);
- atomic_set(&n_rcu_perf_writer_started, 0);
- atomic_set(&n_rcu_perf_writer_finished, 0);
- rcu_perf_print_module_parms(cur_ops, "Start of test");
-
- /* Start up the kthreads. */
-
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(rcu_perf_shutdown, NULL,
- shutdown_task);
- if (firsterr)
- goto unwind;
- schedule_timeout_uninterruptible(1);
- }
- reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
- GFP_KERNEL);
- if (reader_tasks == NULL) {
- VERBOSE_PERFOUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < nrealreaders; i++) {
- firsterr = torture_create_kthread(rcu_perf_reader, (void *)i,
- reader_tasks[i]);
- if (firsterr)
- goto unwind;
- }
- while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders)
- schedule_timeout_uninterruptible(1);
- writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
- GFP_KERNEL);
- writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
- GFP_KERNEL);
- writer_n_durations =
- kcalloc(nrealwriters, sizeof(*writer_n_durations),
- GFP_KERNEL);
- if (!writer_tasks || !writer_durations || !writer_n_durations) {
- VERBOSE_PERFOUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < nrealwriters; i++) {
- writer_durations[i] =
- kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
- GFP_KERNEL);
- if (!writer_durations[i]) {
- firsterr = -ENOMEM;
- goto unwind;
- }
- firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
- writer_tasks[i]);
- if (firsterr)
- goto unwind;
- }
- torture_init_end();
- return 0;
-
-unwind:
- torture_init_end();
- rcu_perf_cleanup();
- return firsterr;
-}
-
-module_init(rcu_perf_init);
-module_exit(rcu_perf_cleanup);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
new file mode 100644
index 000000000000..ffdb30495e3c
--- /dev/null
+++ b/kernel/rcu/rcuscale.c
@@ -0,0 +1,1058 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Read-Copy Update module-based scalability-test facility
+ *
+ * Copyright (C) IBM Corporation, 2015
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+#include <linux/rcupdate_trace.h>
+
+#include "rcu.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
+
+#define SCALE_FLAG "-scale:"
+#define SCALEOUT_STRING(s) \
+ pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
+#define VERBOSE_SCALEOUT_STRING(s) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
+
+/*
+ * The intended use cases for the nreaders and nwriters module parameters
+ * are as follows:
+ *
+ * 1. Specify only the nr_cpus kernel boot parameter. This will
+ * set both nreaders and nwriters to the value specified by
+ * nr_cpus for a mixed reader/writer test.
+ *
+ * 2. Specify the nr_cpus kernel boot parameter, but set
+ * rcuscale.nreaders to zero. This will set nwriters to the
+ * value specified by nr_cpus for an update-only test.
+ *
+ * 3. Specify the nr_cpus kernel boot parameter, but set
+ * rcuscale.nwriters to zero. This will set nreaders to the
+ * value specified by nr_cpus for a read-only test.
+ *
+ * Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers. The reader scalability statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
+ */
+
+#ifdef MODULE
+# define RCUSCALE_SHUTDOWN 0
+#else
+# define RCUSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
+torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
+torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)");
+
+static int nrealreaders;
+static int nrealwriters;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+static u64 **writer_durations;
+static int *writer_n_durations;
+static atomic_t n_rcu_scale_reader_started;
+static atomic_t n_rcu_scale_writer_started;
+static atomic_t n_rcu_scale_writer_finished;
+static wait_queue_head_t shutdown_wq;
+static u64 t_rcu_scale_writer_started;
+static u64 t_rcu_scale_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
+static DEFINE_PER_CPU(atomic_t, n_async_inflight);
+
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_scale_ops {
+ int ptype;
+ void (*init)(void);
+ void (*cleanup)(void);
+ int (*readlock)(void);
+ void (*readunlock)(int idx);
+ unsigned long (*get_gp_seq)(void);
+ unsigned long (*gp_diff)(unsigned long new, unsigned long old);
+ unsigned long (*exp_completed)(void);
+ void (*async)(struct rcu_head *head, rcu_callback_t func);
+ void (*gp_barrier)(void);
+ void (*sync)(void);
+ void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
+ const char *name;
+};
+
+static struct rcu_scale_ops *cur_ops;
+
+/*
+ * Definitions for rcu scalability testing.
+ */
+
+static int rcu_scale_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_scale_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_sync_scale_init(void)
+{
+}
+
+static struct rcu_scale_ops rcu_ops = {
+ .ptype = RCU_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = rcu_scale_read_lock,
+ .readunlock = rcu_scale_read_unlock,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = rcu_exp_batches_completed,
+ .async = call_rcu_hurry,
+ .gp_barrier = rcu_barrier,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for srcu scalability testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale;
+
+static int srcu_scale_read_lock(void) __acquires(srcu_ctlp)
+{
+ return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp)
+{
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static unsigned long srcu_scale_completed(void)
+{
+ return srcu_batches_completed(srcu_ctlp);
+}
+
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+ srcu_barrier(srcu_ctlp);
+}
+
+static void srcu_scale_synchronize(void)
+{
+ synchronize_srcu(srcu_ctlp);
+}
+
+static void srcu_scale_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static struct rcu_scale_ops srcu_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = srcu_scale_read_lock,
+ .readunlock = srcu_scale_read_unlock,
+ .get_gp_seq = srcu_scale_completed,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = srcu_scale_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_scale_synchronize,
+ .exp_sync = srcu_scale_synchronize_expedited,
+ .name = "srcu"
+};
+
+static struct srcu_struct srcud;
+
+static void srcu_sync_scale_init(void)
+{
+ srcu_ctlp = &srcud;
+ init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_scale_cleanup(void)
+{
+ cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_scale_ops srcud_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = srcu_sync_scale_init,
+ .cleanup = srcu_sync_scale_cleanup,
+ .readlock = srcu_scale_read_lock,
+ .readunlock = srcu_scale_read_unlock,
+ .get_gp_seq = srcu_scale_completed,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = srcu_scale_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_scale_synchronize,
+ .exp_sync = srcu_scale_synchronize_expedited,
+ .name = "srcud"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks scalability testing.
+ */
+
+static int tasks_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_ops = {
+ .ptype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_scale_read_lock,
+ .readunlock = tasks_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks,
+ .gp_barrier = rcu_barrier_tasks,
+ .sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
+ .name = "tasks"
+};
+
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_rude,
+ .gp_barrier = rcu_barrier_tasks_rude,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+/*
+ * Definitions for RCU-tasks-trace scalability testing.
+ */
+
+static int tasks_trace_scale_read_lock(void)
+{
+ rcu_read_lock_trace();
+ return 0;
+}
+
+static void tasks_trace_scale_read_unlock(int idx)
+{
+ rcu_read_unlock_trace();
+}
+
+static struct rcu_scale_ops tasks_tracing_ops = {
+ .ptype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_trace_scale_read_lock,
+ .readunlock = tasks_trace_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_trace,
+ .gp_barrier = rcu_barrier_tasks_trace,
+ .sync = synchronize_rcu_tasks_trace,
+ .exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
+ .name = "tasks-tracing"
+};
+
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
+{
+ if (!cur_ops->gp_diff)
+ return new - old;
+ return cur_ops->gp_diff(new, old);
+}
+
+/*
+ * If scalability tests complete, wait for shutdown to commence.
+ */
+static void rcu_scale_wait_shutdown(void)
+{
+ cond_resched_tasks_rcu_qs();
+ if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters)
+ return;
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * RCU scalability reader kthread. Repeatedly does empty RCU read-side
+ * critical section, minimizing update-side interference. However, the
+ * point of this test is not to evaluate reader scalability, but instead
+ * to serve as a test load for update-side scalability testing.
+ */
+static int
+rcu_scale_reader(void *arg)
+{
+ unsigned long flags;
+ int idx;
+ long me = (long)arg;
+
+ VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_rcu_scale_reader_started);
+
+ do {
+ local_irq_save(flags);
+ idx = cur_ops->readlock();
+ cur_ops->readunlock(idx);
+ local_irq_restore(flags);
+ rcu_scale_wait_shutdown();
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_scale_reader");
+ return 0;
+}
+
+/*
+ * Callback function for asynchronous grace periods from rcu_scale_writer().
+ */
+static void rcu_scale_async_cb(struct rcu_head *rhp)
+{
+ atomic_dec(this_cpu_ptr(&n_async_inflight));
+ kfree(rhp);
+}
+
+/*
+ * RCU scale writer kthread. Repeatedly does a grace period.
+ */
+static int
+rcu_scale_writer(void *arg)
+{
+ int i = 0;
+ int i_max;
+ unsigned long jdone;
+ long me = (long)arg;
+ struct rcu_head *rhp = NULL;
+ bool started = false, done = false, alldone = false;
+ u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
+ u64 *wdp;
+ u64 *wdpp = writer_durations[me];
+
+ VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
+ WARN_ON(!wdpp);
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
+ sched_set_fifo_low(current);
+
+ if (holdoff)
+ schedule_timeout_idle(holdoff * HZ);
+
+ /*
+ * Wait until rcu_end_inkernel_boot() is called for normal GP tests
+ * so that RCU is not always expedited for normal GP tests.
+ * The system_state test is approximate, but works well in practice.
+ */
+ while (!gp_exp && system_state != SYSTEM_RUNNING)
+ schedule_timeout_uninterruptible(1);
+
+ t = ktime_get_mono_fast_ns();
+ if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) {
+ t_rcu_scale_writer_started = t;
+ if (gp_exp) {
+ b_rcu_gp_test_started =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+ }
+
+ jdone = jiffies + minruntime * HZ;
+ do {
+ if (writer_holdoff)
+ udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
+ wdp = &wdpp[i];
+ *wdp = ktime_get_mono_fast_ns();
+ if (gp_async) {
+retry:
+ if (!rhp)
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+ atomic_inc(this_cpu_ptr(&n_async_inflight));
+ cur_ops->async(rhp, rcu_scale_async_cb);
+ rhp = NULL;
+ } else if (!kthread_should_stop()) {
+ cur_ops->gp_barrier();
+ goto retry;
+ } else {
+ kfree(rhp); /* Because we are stopping. */
+ }
+ } else if (gp_exp) {
+ cur_ops->exp_sync();
+ } else {
+ cur_ops->sync();
+ }
+ t = ktime_get_mono_fast_ns();
+ *wdp = t - *wdp;
+ i_max = i;
+ if (!started &&
+ atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
+ started = true;
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
+ done = true;
+ sched_set_normal(current, 0);
+ pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
+ scale_type, SCALE_FLAG, me, MIN_MEAS);
+ if (atomic_inc_return(&n_rcu_scale_writer_finished) >=
+ nrealwriters) {
+ schedule_timeout_interruptible(10);
+ rcu_ftrace_dump(DUMP_ALL);
+ SCALEOUT_STRING("Test complete");
+ t_rcu_scale_writer_finished = t;
+ if (gp_exp) {
+ b_rcu_gp_test_finished =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_gp_test_finished =
+ cur_ops->get_gp_seq();
+ }
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+ }
+ if (done && !alldone &&
+ atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
+ alldone = true;
+ if (started && !alldone && i < MAX_MEAS - 1)
+ i++;
+ rcu_scale_wait_shutdown();
+ } while (!torture_must_stop());
+ if (gp_async) {
+ cur_ops->gp_barrier();
+ }
+ writer_n_durations[me] = i_max + 1;
+ torture_kthread_stopping("rcu_scale_writer");
+ return 0;
+}
+
+static void
+rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
+}
+
+/*
+ * Return the number if non-negative. If -1, the number of CPUs.
+ * If less than -1, that much less than the number of CPUs, but
+ * at least one.
+ */
+static int compute_real(int n)
+{
+ int nr;
+
+ if (n >= 0) {
+ nr = n;
+ } else {
+ nr = num_online_cpus() + 1 + n;
+ if (nr <= 0)
+ nr = 1;
+ }
+ return nr;
+}
+
+/*
+ * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?");
+torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_scale_thread_started;
+static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
+
+struct kfree_obj {
+ char kfree_obj[8];
+ struct rcu_head rh;
+};
+
+/* Used if doing RCU-kfree'ing via call_rcu(). */
+static void kfree_call_rcu(struct rcu_head *rh)
+{
+ struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh);
+
+ kfree(obj);
+}
+
+static int
+kfree_scale_thread(void *arg)
+{
+ int i, loop = 0;
+ long me = (long)arg;
+ struct kfree_obj *alloc_ptr;
+ u64 start_time, end_time;
+ long long mem_begin, mem_during = 0;
+ bool kfree_rcu_test_both;
+ DEFINE_TORTURE_RANDOM(tr);
+
+ VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double);
+
+ start_time = ktime_get_mono_fast_ns();
+
+ if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) {
+ if (gp_exp)
+ b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+
+ do {
+ if (!mem_during) {
+ mem_during = mem_begin = si_mem_available();
+ } else if (loop % (kfree_loops / 4) == 0) {
+ mem_during = (mem_during + si_mem_available()) / 2;
+ }
+
+ for (i = 0; i < kfree_alloc_num; i++) {
+ alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
+ if (!alloc_ptr)
+ return -ENOMEM;
+
+ if (kfree_by_call_rcu) {
+ call_rcu(&(alloc_ptr->rh), kfree_call_rcu);
+ continue;
+ }
+
+ // By default kfree_rcu_test_single and kfree_rcu_test_double are
+ // initialized to false. If both have the same value (false or true)
+ // both are randomly tested, otherwise only the one with value true
+ // is tested.
+ if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
+ (kfree_rcu_test_both && torture_random(&tr) & 0x800))
+ kfree_rcu_mightsleep(alloc_ptr);
+ else
+ kfree_rcu(alloc_ptr, rh);
+ }
+
+ cond_resched();
+ } while (!torture_must_stop() && ++loop < kfree_loops);
+
+ if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) {
+ end_time = ktime_get_mono_fast_ns();
+
+ if (gp_exp)
+ b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
+ (unsigned long long)(end_time - start_time), kfree_loops,
+ rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
+ (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+
+ torture_kthread_stopping("kfree_scale_thread");
+ return 0;
+}
+
+static void
+kfree_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (kfree_reader_tasks) {
+ for (i = 0; i < kfree_nrealthreads; i++)
+ torture_stop_kthread(kfree_scale_thread,
+ kfree_reader_tasks[i]);
+ kfree(kfree_reader_tasks);
+ }
+
+ torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread. Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+
+ smp_mb(); /* Wake before output. */
+
+ kfree_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+// Used if doing RCU-kfree'ing via call_rcu().
+static unsigned long jiffies_at_lazy_cb;
+static struct rcu_head lazy_test1_rh;
+static int rcu_lazy_test1_cb_called;
+static void call_rcu_lazy_test1(struct rcu_head *rh)
+{
+ jiffies_at_lazy_cb = jiffies;
+ WRITE_ONCE(rcu_lazy_test1_cb_called, 1);
+}
+
+static int __init
+kfree_scale_init(void)
+{
+ int firsterr = 0;
+ long i;
+ unsigned long jif_start;
+ unsigned long orig_jif;
+
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
+ // Also, do a quick self-test to ensure laziness is as much as
+ // expected.
+ if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
+ pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n");
+ kfree_by_call_rcu = 0;
+ }
+
+ if (kfree_by_call_rcu) {
+ /* do a test to check the timeout. */
+ orig_jif = rcu_lazy_get_jiffies_till_flush();
+
+ rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_barrier();
+
+ jif_start = jiffies;
+ jiffies_at_lazy_cb = 0;
+ call_rcu(&lazy_test1_rh, call_rcu_lazy_test1);
+
+ smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
+
+ rcu_lazy_set_jiffies_till_flush(orig_jif);
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ }
+
+ kfree_nrealthreads = compute_real(kfree_nthreads);
+ /* Start up the kthreads. */
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
+ kfree_mult * sizeof(struct kfree_obj),
+ kfree_by_call_rcu);
+
+ kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+ GFP_KERNEL);
+ if (kfree_reader_tasks == NULL) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < kfree_nrealthreads; i++) {
+ firsterr = torture_create_kthread(kfree_scale_thread, (void *)i,
+ kfree_reader_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads)
+ schedule_timeout_uninterruptible(1);
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ kfree_scale_cleanup();
+ return firsterr;
+}
+
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+static int __init
+rcu_scale_init(void)
+{
+ long i;
+ int firsterr = 0;
+ static struct rcu_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ /* Process args and announce that the scalability'er is on the job. */
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ cur_ops->init();
+
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
+ if (kfree_rcu_test)
+ return kfree_scale_init();
+
+ nrealwriters = compute_real(nwriters);
+ nrealreaders = compute_real(nreaders);
+ atomic_set(&n_rcu_scale_reader_started, 0);
+ atomic_set(&n_rcu_scale_writer_started, 0);
+ atomic_set(&n_rcu_scale_writer_finished, 0);
+ rcu_scale_print_module_parms(cur_ops, "Start of test");
+
+ /* Start up the kthreads. */
+
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+ reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ firsterr = torture_create_kthread(rcu_scale_reader, (void *)i,
+ reader_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
+ schedule_timeout_uninterruptible(1);
+ writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
+ GFP_KERNEL);
+ writer_n_durations =
+ kcalloc(nrealwriters, sizeof(*writer_n_durations),
+ GFP_KERNEL);
+ if (!writer_tasks || !writer_durations || !writer_n_durations) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters; i++) {
+ writer_durations[i] =
+ kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+ GFP_KERNEL);
+ if (!writer_durations[i]) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
+ writer_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ rcu_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(rcu_scale_init);
+module_exit(rcu_scale_cleanup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..7567ca8e743c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -7,7 +7,7 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
- * See also: Documentation/RCU/torture.txt
+ * See also: Documentation/RCU/torture.rst
*/
#define pr_fmt(fmt) fmt
@@ -21,6 +21,7 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <uapi/linux/sched/types.h>
@@ -46,35 +47,26 @@
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
-#ifndef data_race
-#define data_race(expr) \
- ({ \
- expr; \
- })
-#endif
-#ifndef ASSERT_EXCLUSIVE_WRITER
-#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
-#endif
-#ifndef ASSERT_EXCLUSIVE_ACCESS
-#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
-#endif
-
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
@@ -84,76 +76,92 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
+torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
- "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
+static int nrealnocbers;
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
+// Mailbox-like structure to check RCU global memory ordering.
+struct rcu_torture_reader_check {
+ unsigned long rtc_myloops;
+ int rtc_chkrdr;
+ unsigned long rtc_chkloops;
+ int rtc_ready;
+ struct rcu_torture_reader_check *rtc_assigner;
+} ____cacheline_internodealigned_in_smp;
+
+// Update-side data structure used to check RCU readers.
struct rcu_torture {
struct rcu_head rtort_rcu;
int rtort_pipe_count;
struct list_head rtort_free;
int rtort_mbtest;
+ struct rcu_torture_reader_check *rtort_chkp;
};
static LIST_HEAD(rcu_torture_freelist);
@@ -164,21 +172,27 @@ static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static struct rcu_torture_reader_check *rcu_torture_reader_mbchk;
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_mbchk_fail;
+static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
+static unsigned long start_gp_seq;
+static atomic_long_t n_nocb_offload;
+static atomic_long_t n_nocb_deoffload;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -187,10 +201,24 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_SYNC 7
-#define RTWS_STUTTER 8
-#define RTWS_STOPPING 9
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -198,7 +226,21 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
+ "RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
+ "RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
+ "RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
+ "RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
+ "RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
+ "RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -225,12 +267,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
-#define rcu_can_boost() 1
-#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-#define rcu_can_boost() 0
-#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
@@ -264,7 +300,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
/*
* Allocate an element from the rcu_tortures pool.
@@ -311,22 +347,46 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
+ int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
- unsigned long (*get_state)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_comp_state)(void);
+ void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
+ bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
+ unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_gp_completed)(void);
+ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
void (*stats)(void);
+ void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
@@ -336,7 +396,12 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int torture_readlock_not_held(void)
+{
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
@@ -355,7 +420,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -378,7 +443,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
@@ -390,7 +455,12 @@ static bool
rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
+ struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp);
+ if (rtrcp) {
+ WRITE_ONCE(rp->rtort_chkp, NULL);
+ smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
+ }
i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -446,7 +516,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
@@ -454,28 +524,54 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+ return poll;
+}
+
static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_state = get_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .stall_dur = rcu_jiffies_till_stall_check,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .extendables = RCUTORTURE_MAX_EXTEND,
- .name = "rcu"
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .same_gp_state = same_state_synchronize_rcu,
+ .same_gp_state_full = same_state_synchronize_rcu_full,
+ .get_comp_state = get_completed_synchronize_rcu,
+ .get_comp_state_full = get_completed_synchronize_rcu_full,
+ .get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
+ .get_gp_completed = get_completed_synchronize_rcu,
+ .get_gp_completed_full = get_completed_synchronize_rcu_full,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
+ .cond_sync = cond_synchronize_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
+ .call = call_rcu_hurry,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "rcu"
};
/*
@@ -509,6 +605,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -528,10 +625,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
DEFINE_STATIC_SRCU(srcu_ctl);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
+static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
+static int srcu_torture_read_lock(void)
{
- return srcu_read_lock(srcu_ctlp);
+ if (cur_ops == &srcud_ops)
+ return srcu_read_lock_nmisafe(srcu_ctlp);
+ else
+ return srcu_read_lock(srcu_ctlp);
}
static void
@@ -553,9 +654,17 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
+static void srcu_torture_read_unlock(int idx)
+{
+ if (cur_ops == &srcud_ops)
+ srcu_read_unlock_nmisafe(srcu_ctlp, idx);
+ else
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static int torture_srcu_read_lock_held(void)
{
- srcu_read_unlock(srcu_ctlp, idx);
+ return srcu_read_lock_held(srcu_ctlp);
}
static unsigned long srcu_torture_completed(void)
@@ -573,6 +682,21 @@ static void srcu_torture_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static unsigned long srcu_torture_get_gp_state(void)
+{
+ return get_state_synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_start_gp_poll(void)
+{
+ return start_poll_synchronize_srcu(srcu_ctlp);
+}
+
+static bool srcu_torture_poll_gp_state(unsigned long oldstate)
+{
+ return poll_state_synchronize_srcu(srcu_ctlp, oldstate);
+}
+
static void srcu_torture_call(struct rcu_head *head,
rcu_callback_t func)
{
@@ -600,14 +724,20 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
@@ -632,14 +762,20 @@ static struct rcu_torture_ops srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
@@ -651,6 +787,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -659,11 +796,56 @@ static struct rcu_torture_ops busted_srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
* Definitions for RCU-tasks torture testing.
*/
@@ -683,7 +865,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
static void synchronize_rcu_mult_test(void)
{
- synchronize_rcu_mult(call_rcu_tasks, call_rcu);
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
}
static struct rcu_torture_ops tasks_ops = {
@@ -698,6 +880,7 @@ static struct rcu_torture_ops tasks_ops = {
.exp_sync = synchronize_rcu_mult_test,
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
+ .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -705,46 +888,16 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-/*
- * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
- */
+#define TASKS_OPS &tasks_ops,
-static void synchronize_rcu_trivial(void)
-{
- int cpu;
+#else // #ifdef CONFIG_TASKS_RCU
- for_each_online_cpu(cpu) {
- rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
- WARN_ON_ONCE(raw_smp_processor_id() != cpu);
- }
-}
+#define TASKS_OPS
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
-{
- preempt_disable();
- return 0;
-}
+#endif // #else #ifdef CONFIG_TASKS_RCU
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
-{
- preempt_enable();
-}
-static struct rcu_torture_ops trivial_ops = {
- .ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock_trivial,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock_trivial,
- .get_gp_seq = rcu_no_completed,
- .sync = synchronize_rcu_trivial,
- .exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
- .irq_capable = 1,
- .name = "trivial"
-};
+#ifdef CONFIG_TASKS_RUDE_RCU
/*
* Definitions for rude RCU-tasks torture testing.
@@ -767,12 +920,25 @@ static struct rcu_torture_ops tasks_rude_ops = {
.exp_sync = synchronize_rcu_tasks_rude,
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
+ .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for tracing RCU-tasks torture testing.
*/
@@ -799,12 +965,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
+ .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -812,6 +981,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -819,32 +997,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
return cur_ops->gp_diff(new, old);
}
-static bool __maybe_unused torturing_tasks(void)
-{
- return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
-}
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
*/
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- /* Ensure RCU-core accesses precede clearing ->inflight */
- smp_store_release(&rbip->inflight, 0);
-}
-
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
@@ -871,89 +1030,108 @@ static void rcu_torture_enable_rt_throttle(void)
old_rt_runtime = -1;
}
-static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
- if (end - start > test_boost_duration * HZ - HZ / 2) {
+ int cpu;
+ static int dbg_done;
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ return false; // passed on a technicality
+ }
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
+ cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
+
+ }
- return true; /* failed */
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
}
- return false; /* passed */
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
+ sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
- /* Track if the test failed already in this test interval? */
- bool failed = false;
+ bool failed = false; // Test failed already in this test interval
+ bool gp_initiated = false;
- /* Increment n_rcu_torture_boosts once per boost-test */
- while (!kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
if (kthread_should_stop())
goto checkwait;
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
+ oldstarttime = READ_ONCE(boost_starttime);
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
- stutter_wait("rcu_torture_boost");
+ if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
if (torture_must_stop())
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
- call_rcu_time = jiffies;
while (time_before(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!smp_load_acquire(&rbi.inflight)) {
- /* RCU core before ->inflight = 1. */
- smp_store_release(&rbi.inflight, 1);
- call_rcu(&rbi.rcu, rcu_torture_boost_cb);
- /* Check if the boost test failed */
- failed = failed ||
- rcu_torture_boost_failed(call_rcu_time,
- jiffies);
- call_rcu_time = jiffies;
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
+ }
+ if (stutter_wait("rcu_torture_boost")) {
+ sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
}
- stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
}
- /*
- * If boost never happened, then inflight will always be 1, in
- * this case the boost check would never happen in the above
- * loop so do another one here.
- */
- if (!failed && smp_load_acquire(&rbi.inflight))
- rcu_torture_boost_failed(call_rcu_time, jiffies);
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
@@ -962,27 +1140,29 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
+ if (oldstarttime == boost_starttime) {
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
/* Go do the stutter. */
-checkwait: stutter_wait("rcu_torture_boost");
+checkwait: if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
} while (!torture_must_stop());
/* Clean up and exit. */
- while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -997,13 +1177,14 @@ rcu_torture_fqs(void *arg)
{
unsigned long fqs_resume_time;
int fqs_burst_remaining;
+ int oldnice = task_nice(current);
VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
do {
fqs_resume_time = jiffies + fqs_stutter * HZ;
while (time_before(jiffies, fqs_resume_time) &&
!kthread_should_stop()) {
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(HZ / 20);
}
fqs_burst_remaining = fqs_duration;
while (fqs_burst_remaining > 0 &&
@@ -1012,48 +1193,76 @@ rcu_torture_fqs(void *arg)
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
}
- stutter_wait("rcu_torture_fqs");
+ if (stutter_wait("rcu_torture_fqs"))
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_fqs");
return 0;
}
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
+static int nsynctypes;
+
/*
- * RCU torture writer kthread. Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
+ * Determine which grace-period primitives are available.
*/
-static int
-rcu_torture_writer(void *arg)
+static void rcu_torture_write_types(void)
{
- bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
- int expediting = 0;
- unsigned long gp_snap;
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_sync1 = gp_sync;
- int i;
- struct rcu_torture *rp;
- struct rcu_torture *old_rp;
- static DEFINE_TORTURE_RANDOM(rand);
- int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
- RTWS_COND_GET, RTWS_SYNC };
- int nsynctypes = 0;
-
- VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite)
- pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s.\n",
- torture_type, cur_ops->name);
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
- } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1066,14 +1275,109 @@ rcu_torture_writer(void *arg)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
+ if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state &&
+ cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ synctype[nsynctypes++] = RTWS_POLL_GET;
+ pr_info("%s: Testing polling GPs.\n", __func__);
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
+ pr_alert("%s: gp_poll without primitives.\n", __func__);
+ }
+ if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full
+ && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+}
+
+/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ bool boot_ended;
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ int expediting = 0;
+ unsigned long gp_snap;
+ unsigned long gp_snap1;
+ struct rcu_gp_oldstate gp_snap_full;
+ struct rcu_gp_oldstate gp_snap1_full;
+ int i;
+ int idx;
+ int oldnice = task_nice(current);
+ struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ bool stutter_waited;
+ unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " GP expediting controlled from boot/sysfs for %s.\n",
+ torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
- "rcu_torture_writer: No update-side primitives.\n")) {
+ "%s: No update-side primitives.\n", __func__)) {
/*
* No updates primitives, so don't try updating.
* The resulting test won't be testing much, hence the
@@ -1081,11 +1385,12 @@ rcu_torture_writer(void *arg)
*/
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
+ return 0;
}
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
- schedule_timeout_uninterruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
@@ -1105,6 +1410,38 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+
+ // Make sure readers block polled grace periods.
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
+ idx = cur_ops->readlock();
+ cookie = cur_ops->get_gp_state();
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_completed) {
+ cookie = cur_ops->get_gp_completed();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
+ cur_ops->readunlock(idx);
+ }
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_gp_completed_full) {
+ cur_ops->get_gp_completed_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1112,23 +1449,103 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- cur_ops->exp_sync();
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
- gp_snap = cur_ops->get_state();
- i = torture_random(&rand) % 16;
- if (i != 0)
- schedule_timeout_interruptible(i);
- udelay(torture_random(&rand) % 1000);
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET:
+ rcu_torture_writer_state = RTWS_POLL_GET;
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ ulo[i] = cur_ops->get_comp_state();
+ gp_snap = cur_ops->start_gp_poll();
+ rcu_torture_writer_state = RTWS_POLL_WAIT;
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ gp_snap1 = cur_ops->get_gp_state();
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ if (cur_ops->poll_gp_state(ulo[i]) ||
+ cur_ops->same_gp_state(ulo[i], gp_snap1)) {
+ ulo[i] = gp_snap1;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ cur_ops->get_comp_state_full(&rgo[i]);
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ cur_ops->get_gp_state_full(&gp_snap1_full);
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ if (cur_ops->poll_gp_state_full(&rgo[i]) ||
+ cur_ops->same_gp_state_full(&rgo[i],
+ &gp_snap1_full)) {
+ rgo[i] = gp_snap1_full;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- cur_ops->sync();
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1153,19 +1570,26 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- if (stutter_wait("rcu_torture_writer") &&
- !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ boot_ended = rcu_inkernel_boot_has_ended();
+ stutter_waited = stutter_wait("rcu_torture_writer");
+ if (stutter_waited &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- rcu_inkernel_boot_has_ended())
+ boot_ended)
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
- rcu_ftrace_dump(DUMP_ALL);
+ tracing_off();
+ show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
}
+ if (stutter_waited)
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
+ rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
if (expediting > 0)
expediting = -expediting;
@@ -1188,26 +1612,91 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
+ unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
+ if (WARN_ONCE(nsynctypes == 0,
+ "%s: No update-side primitives.\n", __func__)) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+ }
+
do {
- schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
- udelay(torture_random(&rand) & 0x3ff);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- } else if (gp_normal == gp_exp) {
- if (cur_ops->sync && torture_random(&rand) & 0x80)
- cur_ops->sync();
- else if (cur_ops->exp_sync)
+ } else {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ break;
+ case RTWS_EXP_SYNC:
cur_ops->exp_sync();
- } else if (gp_normal && cur_ops->sync) {
- cur_ops->sync();
- } else if (cur_ops->exp_sync) {
- cur_ops->exp_sync();
+ break;
+ case RTWS_COND_GET:
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync(gp_snap);
+ break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
+ case RTWS_POLL_GET:
+ gp_snap = cur_ops->start_gp_poll();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_FULL:
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_SYNC:
+ cur_ops->sync();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
stutter_wait("rcu_torture_fakewriter");
} while (!torture_must_stop());
@@ -1221,6 +1710,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
kfree(rhp);
}
+// Set up and carry out testing of RCU's global memory ordering
+static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
+ struct torture_random_state *trsp)
+{
+ unsigned long loops;
+ int noc = torture_num_online_cpus();
+ int rdrchked;
+ int rdrchker;
+ struct rcu_torture_reader_check *rtrcp; // Me.
+ struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking.
+ struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked.
+ struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me.
+
+ if (myid < 0)
+ return; // Don't try this from timer handlers.
+
+ // Increment my counter.
+ rtrcp = &rcu_torture_reader_mbchk[myid];
+ WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1);
+
+ // Attempt to assign someone else some checking work.
+ rdrchked = torture_random(trsp) % nrealreaders;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ rdrchker = torture_random(trsp) % nrealreaders;
+ rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker];
+ if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker &&
+ smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below.
+ !READ_ONCE(rtp->rtort_chkp) &&
+ !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below.
+ rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0);
+ rtrcp->rtc_chkrdr = rdrchked;
+ WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends.
+ if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) ||
+ cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp))
+ (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out.
+ }
+
+ // If assigned some completed work, do it!
+ rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner);
+ if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready))
+ return; // No work or work not yet ready.
+ rdrchked = rtrcp_assigner->rtc_chkrdr;
+ if (WARN_ON_ONCE(rdrchked < 0))
+ return;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ loops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ atomic_inc(&n_rcu_torture_mbchk_tries);
+ if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops))
+ atomic_inc(&n_rcu_torture_mbchk_fail);
+ rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2;
+ rtrcp_assigner->rtc_ready = 0;
+ smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work.
+ smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
+}
+
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1235,46 +1780,64 @@ static void rcutorture_one_extend(int *readstate, int newstate,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Namely: BH can not be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
- local_bh_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
@@ -1284,13 +1847,19 @@ static void rcutorture_one_extend(int *readstate, int newstate,
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1300,7 +1869,7 @@ static int rcutorture_extend_mask_max(void)
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
@@ -1309,21 +1878,47 @@ static int
rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
{
int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- return mask ?: RCUTORTURE_RDR_RCU;
+
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & bhs;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* Can't modify BH in atomic context */
+ if (oldmask & preempts_irq)
+ mask &= ~bhs;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & bhs;
+ }
+
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
@@ -1342,7 +1937,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
if (!((mask - 1) & mask))
return rtrsp; /* Current RCU reader not extendable. */
/* Bias towards larger numbers of loops. */
- i = (torture_random(trsp) >> 3);
+ i = torture_random(trsp);
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
@@ -1356,8 +1951,11 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
* no data to read. Can be invoked both from process context and
* from a timer handler.
*/
-static bool rcu_torture_one_read(struct torture_random_state *trsp)
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ bool checkpolling = !(torture_random(trsp) & 0xfff);
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int i;
unsigned long started;
unsigned long completed;
@@ -1370,16 +1968,19 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
struct rt_read_seg *rtrsp1;
unsigned long long ts;
+ WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ }
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp) ||
- rcu_read_lock_trace_held() ||
- torturing_tasks());
+ !cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@ -1387,6 +1988,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
+ rcu_torture_reader_do_mbchk(myid, p, trsp);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
@@ -1408,8 +2010,27 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
+ // This next splat is expected behavior if leakpointer, especially
+ // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
+ WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
/* If error or close call, record the sequence of reader protections. */
if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) {
@@ -1433,7 +2054,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
static void rcu_torture_timer(struct timer_list *unused)
{
atomic_long_inc(&n_rcu_torture_timers);
- (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand));
+ (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
/* Test call_rcu() invocation from interrupt handler. */
if (cur_ops->call) {
@@ -1469,13 +2090,13 @@ rcu_torture_reader(void *arg)
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand) && !torture_must_stop())
+ if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
lastsleep = jiffies + 10;
}
- while (num_online_cpus() < mynumonline && !torture_must_stop())
+ while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
@@ -1489,6 +2110,53 @@ rcu_torture_reader(void *arg)
}
/*
+ * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
+ * increase race probabilities and fuzzes the interval between toggling.
+ */
+static int rcu_nocb_toggle(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ int oldnice = task_nice(current);
+ long r;
+ DEFINE_TORTURE_RANDOM(rand);
+ ktime_t toggle_delay;
+ unsigned long toggle_fuzz;
+ ktime_t toggle_interval = ms_to_ktime(nocbs_toggle);
+
+ VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ for_each_possible_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (toggle_interval > ULONG_MAX)
+ toggle_fuzz = ULONG_MAX >> 3;
+ else
+ toggle_fuzz = toggle_interval >> 3;
+ if (toggle_fuzz <= 0)
+ toggle_fuzz = NSEC_PER_USEC;
+ do {
+ r = torture_random(&rand);
+ cpu = (r >> 1) % (maxcpu + 1);
+ if (r & 0x1) {
+ rcu_nocb_cpu_offload(cpu);
+ atomic_long_inc(&n_nocb_offload);
+ } else {
+ rcu_nocb_cpu_deoffload(cpu);
+ atomic_long_inc(&n_nocb_deoffload);
+ }
+ toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL);
+ if (stutter_wait("rcu_nocb_toggle"))
+ sched_set_normal(current, oldnice);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_nocb_toggle");
+ return 0;
+}
+
+/*
* Print torture statistics. Caller must ensure that there is only
* one call to this function at a given time!!! This is normally
* accomplished by relying on the module system to only have one copy
@@ -1514,7 +2182,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@@ -1529,33 +2197,36 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
torture_onoff_stats();
- pr_cont("barrier: %ld/%ld:%ld\n",
+ pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
+ pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
+ pr_cont("nocb-toggles: %ld:%ld\n",
+ atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
+ atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
- WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed
+ WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
@@ -1589,16 +2260,17 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state, gp_seq, flags,
- wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? ~0U : wtp->__state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
if (!splatted && wtp) {
sched_show_task(wtp);
splatted = true;
}
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
rcu_ftrace_dump(DUMP_ALL);
}
rtcv_snap = rcu_torture_current_version;
@@ -1621,6 +2293,56 @@ rcu_torture_stats(void *arg)
return 0;
}
+/* Test mem_dump_obj() and friends. */
+static void rcu_torture_mem_dump_obj(void)
+{
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
+
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
+}
+
static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
@@ -1634,7 +2356,10 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
- "onoff_interval=%d onoff_holdoff=%d\n",
+ "onoff_interval=%d onoff_holdoff=%d "
+ "read_exit_delay=%d read_exit_burst=%d "
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +2368,10 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block,
n_barrier_cbs,
- onoff_interval, onoff_holdoff);
+ onoff_interval, onoff_holdoff,
+ read_exit_delay, read_exit_burst,
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -1670,13 +2398,25 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -1685,12 +2425,20 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+ pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+ .notifier_call = rcu_torture_stall_nf,
+};
+
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
* induces a CPU stall for the time specified by stall_cpu.
@@ -1698,9 +2446,16 @@ static int rcutorture_booster_init(unsigned int cpu)
static int rcu_torture_stall(void *args)
{
int idx;
+ int ret;
unsigned long stop_at;
VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -1724,19 +2479,31 @@ static int rcu_torture_stall(void *args)
local_irq_disable();
else if (!stall_cpu_block)
preempt_disable();
- pr_alert("rcu_torture_stall start on CPU %d.\n",
- raw_smp_processor_id());
+ pr_alert("%s start on CPU %d.\n",
+ __func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
preempt_enable();
cur_ops->readunlock(idx);
}
- pr_alert("rcu_torture_stall end.\n");
+ pr_alert("%s end.\n", __func__);
+ if (rcu_cpu_stall_notifiers && !ret) {
+ ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
+ }
torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
schedule_timeout_interruptible(10 * HZ);
@@ -1800,9 +2567,13 @@ struct rcu_fwd {
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
+static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -1815,8 +2586,8 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
@@ -1914,13 +2685,16 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
- if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) {
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (!cur_ops->sync)
+ return; // Cannot do need_resched() forward progress testing without ->sync.
+ if (cur_ops->call && cur_ops->cb_barrier) {
init_rcu_head_on_stack(&fcs.rh);
selfpropcb = true;
}
/* Tight loop containing cond_resched(). */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
@@ -1950,11 +2724,13 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for queued callbacks. */
}
@@ -1963,7 +2739,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
destroy_rcu_head_on_stack(&fcs.rh);
}
schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -1983,13 +2759,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
unsigned long stopat;
unsigned long stoppedat;
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
if (!cur_ops->call)
return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2018,7 +2795,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
@@ -2028,8 +2805,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
@@ -2041,6 +2821,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree(rfp);
@@ -2053,11 +2834,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
@@ -2068,25 +2852,42 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
- struct rcu_fwd *rfp = rcu_fwds;
+ int i;
+ long ncbs;
+ struct rcu_fwd *rfp;
+ mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
+ if (!rfp) {
+ mutex_unlock(&rcu_fwd_mutex);
+ return NOTIFY_OK;
+ }
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
+ mutex_unlock(&rcu_fwd_mutex);
return NOTIFY_OK;
}
@@ -2097,6 +2898,10 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
+ int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2106,22 +2911,38 @@ static int rcu_torture_fwd_prog(void *args)
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- register_oom_notifier(&rcutorture_oom_nb);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
+ schedule_timeout_interruptible(HZ / 20);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
- unregister_oom_notifier(&rcutorture_oom_nb);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
- stutter_wait("rcu_torture_fwd_prog");
+ if (stutter_wait("rcu_torture_fwd_prog"))
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
@@ -2129,18 +2950,29 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
- if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
+ if ((!cur_ops->sync && !cur_ops->call) ||
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
- if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS))
+ fwd_progress = 0;
+ if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
return 0;
@@ -2149,12 +2981,51 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
+ mutex_lock(&rcu_fwd_mutex);
+ rcu_fwds = rfp;
+ mutex_unlock(&rcu_fwd_mutex);
+ register_oom_notifier(&rcutorture_oom_nb);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static void rcu_torture_fwd_prog_cleanup(void)
+{
+ int i;
+ struct rcu_fwd *rfp;
+
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
+ mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
+ rcu_fwds = NULL;
+ mutex_unlock(&rcu_fwd_mutex);
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
@@ -2175,7 +3046,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
- bool lastphase = 0;
+ bool lastphase = false;
bool newphase;
struct rcu_head rcu;
@@ -2321,13 +3192,15 @@ static bool rcu_torture_can_boost(void)
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
+ return false;
prio = rcu_get_gp_kthreads_prio();
if (!prio)
return false;
if (prio < 2) {
- if (boost_warn_once == 1)
+ if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@ -2338,6 +3211,115 @@ static bool rcu_torture_can_boost(void)
return true;
}
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread which just does an rcutorture reader and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+ struct torture_random_state *trsp = trsp_in;
+
+ set_user_nice(current, MAX_NICE);
+ // Minimize time between reading and exiting.
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(HZ / 20);
+ (void)rcu_torture_one_read(trsp, -1);
+ return 0;
+}
+
+// Parent kthread which creates and destroys read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+ bool errexit = false;
+ int i;
+ struct task_struct *tsp;
+ DEFINE_TORTURE_RANDOM(trs);
+
+ // Allocate and initialize.
+ set_user_nice(current, MAX_NICE);
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+ // Each pass through this loop does one read-exit episode.
+ do {
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
+ }
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
+ } while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+ // Clean up and exit.
+ smp_store_release(&read_exit_child_stopped, true); // After reaping.
+ smp_mb(); // Store before wakeup.
+ wake_up(&read_exit_wq);
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(HZ / 20);
+ torture_kthread_stopping("rcu_torture_read_exit");
+ return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+ if (read_exit_burst <= 0)
+ return 0;
+ init_waitqueue_head(&read_exit_wq);
+ read_exit_child_stop = false;
+ read_exit_child_stopped = false;
+ return torture_create_kthread(rcu_torture_read_exit, NULL,
+ read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+ if (!read_exit_task)
+ return;
+ WRITE_ONCE(read_exit_child_stop, true);
+ smp_mb(); // Above write before wait.
+ wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+ torture_stop_kthread(rcutorture_read_exit, read_exit_task);
+}
+
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -2349,56 +3331,77 @@ rcu_torture_cleanup(void)
int i;
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
+ rcu_gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
+ rcu_gp_slow_unregister(NULL);
return;
}
- show_rcu_gp_kthreads();
+ rcutorture_test_nmis(test_nmis);
+
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
+ rcu_torture_fwd_prog_cleanup();
torture_stop_kthread(rcu_torture_stall, stall_task);
torture_stop_kthread(rcu_torture_writer, writer_task);
+ if (nocb_tasks) {
+ for (i = 0; i < nrealnocbers; i++)
+ torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]);
+ kfree(nocb_tasks);
+ nocb_tasks = NULL;
+ }
+
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
reader_tasks[i]);
kfree(reader_tasks);
+ reader_tasks = NULL;
}
- rcu_torture_current = NULL;
+ kfree(rcu_torture_reader_mbchk);
+ rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
- }
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
- pr_alert("%s: End-test grace-period state: g%lu f%#x\n",
- cur_ops->name, gp_seq, flags);
+ pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n",
+ cur_ops->name, (long)gp_seq, flags,
+ rcutorture_seq_diff(gp_seq, start_gp_seq));
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- if (rcu_torture_can_boost())
+ if (rcu_torture_can_boost() && rcutor_hp >= 0)
cpuhp_remove_state(rcutor_hp);
/*
* Wait for all RCU callbacks to fire, then do torture-type-specific
* cleanup operations.
*/
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
+ rcu_torture_mem_dump_obj();
+
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
@@ -2436,6 +3439,7 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
+ rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -2468,6 +3472,7 @@ static void rcu_test_debug_objects(void)
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
@@ -2476,10 +3481,14 @@ static void rcu_test_debug_objects(void)
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu(&rh2, rcu_torture_leak_cb);
- call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ call_rcu_hurry(&rh2, rcu_torture_leak_cb);
+ call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ if (rhp) {
+ call_rcu_hurry(rhp, rcu_torture_leak_cb);
+ call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ }
local_irq_enable();
rcu_read_unlock();
preempt_enable();
@@ -2489,6 +3498,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
+ kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -2502,16 +3512,200 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
static int __init
rcu_torture_init(void)
{
long i;
int cpu;
int firsterr = 0;
+ int flags = 0;
+ unsigned long gp_seq = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops, &tasks_rude_ops,
- &tasks_tracing_ops, &trivial_ops,
+ &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+ TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -2530,7 +3724,6 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_cont(" %s", torture_ops[i]->name);
pr_cont("\n");
- WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
firsterr = -EINVAL;
cur_ops = NULL;
goto unwind;
@@ -2539,9 +3732,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -2550,6 +3751,11 @@ rcu_torture_init(void)
nrealreaders = 1;
}
rcu_torture_print_module_parms(cur_ops, "Start of test");
+ rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ start_gp_seq = gp_seq;
+ pr_alert("%s: Start-test grace-period state: g%ld f%#x\n",
+ cur_ops->name, (long)gp_seq, flags);
/* Set up the freelist. */
@@ -2568,10 +3774,11 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_alloc_fail, 0);
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_mbchk_fail, 0);
+ atomic_set(&n_rcu_torture_mbchk_tries, 0);
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -2587,16 +3794,17 @@ rcu_torture_init(void)
/* Start up the kthreads. */
+ rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (nfakewriters > 0) {
fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -2604,31 +3812,54 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
- if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
+ GFP_KERNEL);
+ if (!reader_tasks || !rcu_torture_reader_mbchk) {
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
+ rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ nrealnocbers = nocbs_nthreads;
+ if (WARN_ON(nrealnocbers < 0))
+ nrealnocbers = 1;
+ if (WARN_ON(nocbs_toggle < 0))
+ nocbs_toggle = HZ;
+ if (nrealnocbers > 0) {
+ nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ if (nocb_tasks == NULL) {
+ TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ } else {
+ nocb_tasks = NULL;
+ }
+ for (i = 0; i < nrealnocbers; i++) {
+ firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter < 0)
@@ -2638,16 +3869,18 @@ rcu_torture_init(void)
t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
firsterr = torture_stutter_init(stutter * HZ, t);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (fqs_duration < 0)
fqs_duration = 0;
- if (fqs_duration) {
+ if (fqs_holdoff < 0)
+ fqs_holdoff = 0;
+ if (fqs_duration && fqs_holdoff) {
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_boost_interval < 1)
@@ -2661,35 +3894,43 @@ rcu_torture_init(void)
firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
rcutorture_booster_init,
rcutorture_booster_cleanup);
- if (firsterr < 0)
- goto unwind;
rcutor_hp = firsterr;
+ if (torture_init_error(firsterr))
+ goto unwind;
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
rcutorture_sync);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_stall_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_fwd_prog_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_barrier_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
+ goto unwind;
+ firsterr = rcu_torture_read_exit_init();
+ if (torture_init_error(firsterr))
goto unwind;
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
+ rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
torture_init_end();
rcu_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
new file mode 100644
index 000000000000..2c2648a3ad30
--- /dev/null
+++ b/kernel/rcu/refscale.c
@@ -0,0 +1,1169 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Scalability test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define SCALE_FLAG "-ref-scale: "
+
+#define SCALEOUT(s, x...) \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
+
+#define VERBOSE_SCALEOUT(s, x...) \
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
+
+static atomic_t verbose_batch_ctr;
+
+#define VERBOSE_SCALEOUT_BATCH(s, x...) \
+do { \
+ if (verbose && \
+ (verbose_batched <= 0 || \
+ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
+ schedule_timeout_uninterruptible(1); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } \
+} while (0)
+
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
+// Number of loops per experiment, all readers execute operations concurrently.
+torture_param(long, loops, 10000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+
+#ifdef MODULE
+# define REFSCALE_SHUTDOWN 0
+#else
+# define REFSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+
+struct reader_task {
+ struct task_struct *task;
+ int start_reader;
+ wait_queue_head_t wq;
+ u64 last_duration_ns;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_scale_ops {
+ bool (*init)(void);
+ void (*cleanup)(void);
+ void (*readsection)(const int nloops);
+ void (*delaysection)(const int nloops, const int udl, const int ndl);
+ const char *name;
+};
+
+static struct ref_scale_ops *cur_ops;
+
+static void un_delay(const int udl, const int ndl)
+{
+ if (udl)
+ udelay(udl);
+ if (ndl)
+ ndelay(ndl);
+}
+
+static void ref_rcu_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ un_delay(udl, ndl);
+ rcu_read_unlock();
+ }
+}
+
+static bool rcu_sync_scale_init(void)
+{
+ return true;
+}
+
+static struct ref_scale_ops rcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_rcu_read_section,
+ .delaysection = ref_rcu_delay_section,
+ .name = "rcu"
+};
+
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
+
+static void srcu_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static struct ref_scale_ops srcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_ref_scale_read_section,
+ .delaysection = srcu_ref_scale_delay_section,
+ .name = "srcu"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ continue;
+}
+
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ un_delay(udl, ndl);
+}
+
+static struct ref_scale_ops rcu_tasks_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_tasks_ref_scale_read_section,
+ .delaysection = rcu_tasks_ref_scale_delay_section,
+ .name = "rcu-tasks"
+};
+
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ rcu_read_unlock_trace();
+ }
+}
+
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ un_delay(udl, ndl);
+ rcu_read_unlock_trace();
+ }
+}
+
+static struct ref_scale_ops rcu_trace_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_trace_ref_scale_read_section,
+ .delaysection = rcu_trace_ref_scale_delay_section,
+ .name = "rcu-trace"
+};
+
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static void ref_refcnt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ atomic_dec(&refcnt);
+ }
+}
+
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ un_delay(udl, ndl);
+ atomic_dec(&refcnt);
+ }
+}
+
+static struct ref_scale_ops refcnt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_refcnt_section,
+ .delaysection = ref_refcnt_delay_section,
+ .name = "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static bool ref_rwlock_init(void)
+{
+ rwlock_init(&test_rwlock);
+ return true;
+}
+
+static void ref_rwlock_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ un_delay(udl, ndl);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static struct ref_scale_ops rwlock_ops = {
+ .init = ref_rwlock_init,
+ .readsection = ref_rwlock_section,
+ .delaysection = ref_rwlock_delay_section,
+ .name = "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static bool ref_rwsem_init(void)
+{
+ init_rwsem(&test_rwsem);
+ return true;
+}
+
+static void ref_rwsem_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ up_read(&test_rwsem);
+ }
+}
+
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ un_delay(udl, ndl);
+ up_read(&test_rwsem);
+ }
+}
+
+static struct ref_scale_ops rwsem_ops = {
+ .init = ref_rwsem_init,
+ .readsection = ref_rwsem_section,
+ .delaysection = ref_rwsem_delay_section,
+ .name = "rwsem"
+};
+
+// Definitions for global spinlock
+static DEFINE_RAW_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ un_delay(udl, ndl);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_ops = {
+ .readsection = ref_lock_section,
+ .delaysection = ref_lock_delay_section,
+ .name = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ un_delay(udl, ndl);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_irq_ops = {
+ .readsection = ref_lock_irq_section,
+ .delaysection = ref_lock_irq_delay_section,
+ .name = "lock-irq"
+};
+
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_acqrel_section(const int nloops)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ un_delay(udl, ndl);
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops acqrel_ops = {
+ .readsection = ref_acqrel_section,
+ .delaysection = ref_acqrel_delay_section,
+ .name = "acqrel"
+};
+
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
+
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ b = READ_ONCE(rtsp->a);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static struct ref_scale_ops typesafe_ref_ops;
+static struct ref_scale_ops typesafe_lock_ops;
+static struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
+static void rcu_scale_one_reader(void)
+{
+ if (readdelay <= 0)
+ cur_ops->readsection(loops);
+ else
+ cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
+}
+
+// Reader kthread. Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_scale_reader(void *arg)
+{
+ unsigned long flags;
+ long me = (long)arg;
+ struct reader_task *rt = &(reader_tasks[me]);
+ u64 start;
+ s64 duration;
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_init);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+repeat:
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
+
+ // Wait for signal that this reader can start.
+ wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
+ torture_must_stop());
+
+ if (torture_must_stop())
+ goto end;
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ WARN_ON_ONCE(raw_smp_processor_id() != me);
+
+ WRITE_ONCE(rt->start_reader, 0);
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started))
+ cpu_relax();
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+
+
+ // To reduce noise, do an initial cache-warming invocation, check
+ // in, and then keep warming until everyone has checked in.
+ rcu_scale_one_reader();
+ if (!atomic_dec_return(&n_warmedup))
+ while (atomic_read_acquire(&n_warmedup))
+ rcu_scale_one_reader();
+ // Also keep interrupts disabled. This also has the effect
+ // of preventing entries into slow path for rcu_read_unlock().
+ local_irq_save(flags);
+ start = ktime_get_mono_fast_ns();
+
+ rcu_scale_one_reader();
+
+ duration = ktime_get_mono_fast_ns() - start;
+ local_irq_restore(flags);
+
+ rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+ // To reduce runtime-skew noise, do maintain-load invocations until
+ // everyone is done.
+ if (!atomic_dec_return(&n_cooleddown))
+ while (atomic_read_acquire(&n_cooleddown))
+ rcu_scale_one_reader();
+
+ if (atomic_dec_and_test(&nreaders_exp))
+ wake_up(&main_wq);
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
+
+ if (!torture_must_stop())
+ goto repeat;
+end:
+ torture_kthread_stopping("ref_scale_reader");
+ return 0;
+}
+
+static void reset_readers(void)
+{
+ int i;
+ struct reader_task *rt;
+
+ for (i = 0; i < nreaders; i++) {
+ rt = &(reader_tasks[i]);
+
+ rt->last_duration_ns = 0;
+ }
+}
+
+// Print the results of each reader and return the sum of all their durations.
+static u64 process_durations(int n)
+{
+ int i;
+ struct reader_task *rt;
+ char buf1[64];
+ char *buf;
+ u64 sum = 0;
+
+ buf = kmalloc(800 + 64, GFP_KERNEL);
+ if (!buf)
+ return 0;
+ buf[0] = 0;
+ sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
+
+ for (i = 0; i < n && !torture_must_stop(); i++) {
+ rt = &(reader_tasks[i]);
+ sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
+
+ if (i % 5 == 0)
+ strcat(buf, "\n");
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
+ strcat(buf, buf1);
+
+ sum += rt->last_duration_ns;
+ }
+ pr_alert("%s\n", buf);
+
+ kfree(buf);
+ return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments. For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+ int exp, r;
+ char buf1[64];
+ char *buf;
+ u64 *result_avg;
+
+ set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ VERBOSE_SCALEOUT("main_func task started");
+ result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
+ if (!result_avg || !buf) {
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
+ }
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ // Wait for all threads to start.
+ atomic_inc(&n_init);
+ while (atomic_read(&n_init) < nreaders + 1)
+ schedule_timeout_uninterruptible(1);
+
+ // Start exp readers up per experiment
+ for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+ if (torture_must_stop())
+ goto end;
+
+ reset_readers();
+ atomic_set(&nreaders_exp, nreaders);
+ atomic_set(&n_started, nreaders);
+ atomic_set(&n_warmedup, nreaders);
+ atomic_set(&n_cooleddown, nreaders);
+
+ exp_idx = exp;
+
+ for (r = 0; r < nreaders; r++) {
+ smp_store_release(&reader_tasks[r].start_reader, 1);
+ wake_up(&reader_tasks[r].wq);
+ }
+
+ VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
+ nreaders);
+
+ wait_event(main_wq,
+ !atomic_read(&nreaders_exp) || torture_must_stop());
+
+ VERBOSE_SCALEOUT("main_func: experiment ended");
+
+ if (torture_must_stop())
+ goto end;
+
+ result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
+ }
+
+ // Print the average of all experiments
+ SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+ pr_alert("Runs\tTime(ns)\n");
+ for (exp = 0; exp < nruns; exp++) {
+ u64 avg;
+ u32 rem;
+
+ avg = div_u64_rem(result_avg[exp], 1000, &rem);
+ sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
+ strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
+ }
+
+ pr_alert("%s", buf);
+
+oom_exit:
+ // This will shutdown everything including us.
+ if (shutdown) {
+ shutdown_start = 1;
+ wake_up(&shutdown_wq);
+ }
+
+ // Wait for torture to stop us
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+
+end:
+ torture_kthread_stopping("main_func");
+ kfree(result_avg);
+ kfree(buf);
+ return 0;
+}
+
+static void
+ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
+}
+
+static void
+ref_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nreaders; i++)
+ torture_stop_kthread("ref_scale_reader",
+ reader_tasks[i].task);
+ }
+ kfree(reader_tasks);
+
+ torture_stop_kthread("main_task", main_task);
+ kfree(main_task);
+
+ // Do scale-type-specific cleanup operations.
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+// Shutdown kthread. Just waits to be awakened, then shuts down system.
+static int
+ref_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, shutdown_start);
+
+ smp_mb(); // Wake before output.
+ ref_scale_cleanup();
+ kernel_power_off();
+
+ return -EINVAL;
+}
+
+static int __init
+ref_scale_init(void)
+{
+ long i;
+ int firsterr = 0;
+ static struct ref_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
+
+ ref_scale_print_module_parms(cur_ops, "Start of test");
+
+ // Shutdown task
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ // Reader tasks (default to ~75% of online CPUs).
+ if (nreaders < 0)
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+ if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ loops = 1;
+ if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
+ nreaders = 1;
+ if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
+ nruns = 1;
+ reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (!reader_tasks) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
+
+ for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
+ firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
+ reader_tasks[i].task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ // Main Task
+ init_waitqueue_head(&main_wq);
+ firsterr = torture_create_kthread(main_func, NULL, main_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ ref_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c9..c38e5933a5d6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_gp_running = false;
ssp->srcu_gp_waiting = false;
ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -93,10 +96,10 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -124,16 +127,18 @@ void srcu_drive_gp(struct work_struct *wp)
ssp->srcu_cb_head = NULL;
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = ssp->srcu_idx;
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/* Invoke the callbacks we removed above. */
while (lh) {
rhp = lh;
lh = lh->next;
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -146,11 +151,27 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (READ_ONCE(ssp->srcu_cb_head))
+ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned long cookie;
+
+ cookie = get_state_synchronize_srcu(ssp);
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ return;
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
+ }
+}
+
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
@@ -166,12 +187,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(ssp->srcu_gp_running)) {
- if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
- else if (list_empty(&ssp->srcu_work.entry))
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
- }
+ srcu_gp_start_if_needed(ssp);
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -182,6 +198,18 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+
+ might_sleep();
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
call_srcu(ssp, &rs.head, wakeme_after_rcu);
@@ -190,6 +218,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret = get_state_synchronize_srcu(ssp);
+
+ srcu_gp_start_if_needed(ssp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
+
+ barrier();
+ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d3ef700fb0e..0351a4e83529 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,24 +24,12 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/srcu.h>
#include "rcu.h"
#include "rcu_segcblist.h"
-#ifndef data_race
-#define data_race(expr) \
- ({ \
- expr; \
- })
-#endif
-#ifndef ASSERT_EXCLUSIVE_WRITER
-#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
-#endif
-#ifndef ASSERT_EXCLUSIVE_ACCESS
-#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
-#endif
-
/* Holdoff in nanoseconds for auto-expediting. */
#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
@@ -51,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
@@ -61,39 +78,90 @@ static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irq_rcu_node(p) \
+#define spin_unlock_irq_rcu_node(p) \
spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
/*
- * Initialize SRCU combining tree. Note that statically allocated
+ * Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return s == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocated and initialize SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
int cpu;
int i;
@@ -103,10 +171,16 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
struct srcu_node *snp;
struct srcu_node *snp_first;
+ /* Initialize geometry if it has not already been initialized. */
+ rcu_init_geometry();
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
+ return false;
+
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -115,23 +189,23 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
- snp->srcu_have_cbs[i] = 0;
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
snp->srcu_data_have_cbs[i] = 0;
}
- snp->srcu_gp_seq_needed_exp = 0;
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -139,62 +213,73 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
- rcu_segcblist_init(&sdp->srcu_cblist);
- sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->cpu = cpu;
- INIT_WORK(&sdp->work, srcu_invoke_callbacks);
- timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
- sdp->ssp = ssp;
- sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
- if (is_static)
- continue;
-
- /* Dynamically allocated, better be no srcu_read_locks()! */
- for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
- sdp->srcu_lock_count[i] = 0;
- sdp->srcu_unlock_count[i] = 0;
- }
+ sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
}
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
}
/*
* Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures. The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- init_srcu_struct_nodes(ssp, is_static);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
- return ssp->sda ? 0 : -ENOMEM;
+ if (!ssp->sda)
+ goto err_free_sup;
+ init_srcu_struct_data(ssp);
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ goto err_free_sda;
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
+ return 0;
+
+err_free_sda:
+ if (!is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+err_free_sup:
+ if (!is_static) {
+ kfree(ssp->srcu_sup);
+ ssp->srcu_sup = NULL;
+ }
+ return -ENOMEM;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -205,7 +290,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -222,7 +306,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -230,6 +313,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size-state. */
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
@@ -242,15 +405,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -265,7 +428,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
}
return sum;
}
@@ -277,13 +440,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
}
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
+ "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
return sum;
}
@@ -312,24 +480,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
/*
* If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does
- * not mean that there are no more readers, as one could have read
- * the current index but not have incremented the lock counter yet.
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_idx but not yet have incremented its CPU's
+ * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
+ * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
+ * in a system whose address space was fully populated with memory.
+ * Call this quantity Nt.
+ *
+ * So suppose that the updater is preempted at this point in the
+ * code for a long time. That now-preempted updater has already
+ * flipped ->srcu_idx (possibly during the preceding grace period),
+ * done an smp_mb() (again, possibly during the preceding grace
+ * period), and summed up the ->srcu_unlock_count[idx] counters.
+ * How many times can a given one of the aforementioned Nt tasks
+ * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
+ * counter, in the absence of nesting?
+ *
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_idx and is just about to use that value
+ * to index its increment of ->srcu_lock_count[idx]. But as soon as
+ * it leaves that SRCU read-side critical section, it will increment
+ * ->srcu_unlock_count[idx], which must follow the updater's above
+ * read from that same value. Thus, as soon the reading task does
+ * an smp_mb() and a later fetch from ->srcu_idx, that task will be
+ * guaranteed to get the new index. Except that the increment of
+ * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
+ * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
+ * is before the smp_mb(). Thus, that task might not see the new
+ * value of ->srcu_idx until the -second- __srcu_read_lock(),
+ * which in turn means that this task might well increment
+ * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
+ * not just once.
+ *
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
*
- * So suppose that the updater is preempted here for so long
- * that more than ULONG_MAX non-nested readers come and go in
- * the meantime. It turns out that this cannot result in overflow
- * because if a reader modifies its unlock count after we read it
- * above, then that reader's next load of ->srcu_idx is guaranteed
- * to get the new value, which will cause it to operate on the
- * other bank of counters, where it cannot contribute to the
- * overflow of these counters. This means that there is a maximum
- * of 2*NR_CPUS increments, which cannot overflow given current
- * systems, especially not on 64-bit systems.
+ * That is, there can be almost Nt + Nc further increments of
+ * ->srcu_lock_count[idx] for the old index, where Nc is the number
+ * of CPUs. But this is OK because the size of the task_struct
+ * structure limits the value of Nt and current systems limit Nc
+ * to a few thousand.
*
- * OK, how about nesting? This does impose a limit on nesting
- * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
- * especially on 64-bit systems.
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
*/
return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
@@ -351,15 +554,60 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[0]);
- sum += READ_ONCE(cpuc->srcu_lock_count[1]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
}
return sum;
}
-#define SRCU_INTERVAL 1
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
+// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
/*
* Return grace-period delay, zero if there are expedited grace
@@ -367,10 +615,25 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
- READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
- return 0;
- return SRCU_INTERVAL;
+ unsigned long gpstart;
+ unsigned long j;
+ unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(sup->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
/**
@@ -383,12 +646,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -397,17 +661,49 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
- pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(ssp->sda);
- ssp->sda = NULL;
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
+ }
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+#ifdef CONFIG_PROVE_RCU
+/*
+ * Check for consistent NMI safety.
+ */
+void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+{
+ int nmi_safe_mask = 1 << nmi_safe;
+ int old_nmi_safe_mask;
+ struct srcu_data *sdp;
+
+ /* NMI-unsafe use in NMI is a bad sign */
+ WARN_ON_ONCE(!nmi_safe && in_nmi());
+ sdp = raw_cpu_ptr(ssp->sda);
+ old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
+ if (!old_nmi_safe_mask) {
+ WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
+ return;
+ }
+ WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+}
+EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
+#endif /* CONFIG_PROVE_RCU */
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
@@ -418,7 +714,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -432,38 +728,59 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
*/
-#define SRCU_RETRY_CHECK_DELAY 5
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ int idx;
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
+ atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
+
+#endif // CONFIG_NEED_SRCU_NMI_SAFE
/*
* Start an SRCU grace period.
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
int state;
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
- spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
- spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -507,7 +824,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
int cpu;
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1 << (cpu - snp->grplo))))
+ if (!(mask & (1UL << (cpu - snp->grplo))))
continue;
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
@@ -524,7 +841,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -533,71 +850,92 @@ static void srcu_gp_end(struct srcu_struct *ssp)
int idx;
unsigned long mask;
struct srcu_data *sdp;
+ unsigned long sgsne;
struct srcu_node *snp;
+ int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(ssp);
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
- cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
- if (last_lvl)
- cbs = snp->srcu_have_cbs[idx] == gpseq;
- snp->srcu_have_cbs[idx] = gpseq;
- rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
- if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
- mask = snp->srcu_data_have_cbs[idx];
- snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
- if (cbs)
- srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
- /* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check) && last_lvl)
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed + 100))
- sdp->srcu_gp_seq_needed = gpseq;
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed_exp + 100))
- sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- }
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
}
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
+ }
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
@@ -612,23 +950,27 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
unsigned long s)
{
unsigned long flags;
+ unsigned long sgsne;
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
- ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
- return;
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
- return;
}
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -640,67 +982,85 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
*
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
+ *
+ * The srcu read lock should be hold around this function. And s is a seq snap
+ * after holding that lock.
*/
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
{
unsigned long flags;
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
- struct srcu_node *snp = sdp->mynode;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
- /* Each pass through the loop does one level of the srcu_node tree. */
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
- return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
- if (snp == sdp->mynode && snp_seq == s)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
- if (snp == sdp->mynode && snp_seq != s) {
- srcu_schedule_cbs_sdp(sdp, do_norm
- ? SRCU_INTERVAL
- : 0);
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
- if (!do_norm)
- srcu_funnel_exp_start(ssp, snp, s);
- return;
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- snp->srcu_have_cbs[idx] = s;
- if (snp == sdp->mynode)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
- /* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
- srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ queue_delayed_work(rcu_gp_wq, &sup->work,
+ !!srcu_get_delay(ssp));
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -710,12 +1070,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}
@@ -727,23 +1091,44 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
*/
smp_mb(); /* D */ /* Pairs with C. */
}
@@ -766,7 +1151,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* it, if this function was preempted for enough time for the counters
* to wrap, it really doesn't matter whether or not we expedite the grace
* period. The extra overhead of a needlessly expedited grace period is
- * negligible when amoritized over that time period, and the extra latency
+ * negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
static bool srcu_might_be_idle(struct srcu_struct *ssp)
@@ -777,35 +1162,36 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long t;
unsigned long tlast;
+ check_init_srcu_struct(ssp);
/* If the local srcu_data structure has callbacks, not idle. */
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
- * No local callbacks, so probabalistically probe global state.
+ * No local callbacks, so probabilistically probe global state.
* Exact information would require acquiring locks, which would
- * kill scalability, hence the probabalistic nature of the probe.
+ * kill scalability, hence the probabilistic nature of the probe.
*/
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -818,6 +1204,93 @@ static void srcu_leak_callback(struct rcu_head *rhp)
}
/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
+
+ check_init_srcu_struct(ssp);
+ /*
+ * While starting a new grace period, make sure we are in an
+ * SRCU read-side critical section so that the grace-period
+ * sequence number cannot wrap around in the meantime.
+ */
+ idx = __srcu_read_lock_nmisafe(ssp);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ /*
+ * The snapshot for acceleration must be taken _before_ the read of the
+ * current gp sequence used for advancing, otherwise advancing may fail
+ * and acceleration may then fail too.
+ *
+ * This could happen if:
+ *
+ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+ *
+ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
+ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+ *
+ * 3) This value is passed to rcu_segcblist_advance() which can't move
+ * any segment forward and fails.
+ *
+ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+ * But then the call to rcu_seq_snap() observes the grace period for the
+ * RCU_WAIT_TAIL segment as completed and the subsequent one for the
+ * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
+ * so it returns a snapshot of the next grace period, which is X + 12.
+ *
+ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+ * freshly enqueued callback in RCU_NEXT_TAIL can't move to
+ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+ * period (gp_num = X + 8). So acceleration fails.
+ */
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+ if (rhp) {
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+ }
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
+ __srcu_read_unlock_nmisafe(ssp, idx);
+ return s;
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -848,14 +1321,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
- unsigned long flags;
- int idx;
- bool needexp = false;
- bool needgp = false;
- unsigned long s;
- struct srcu_data *sdp;
-
- check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -863,29 +1328,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
return;
}
rhp->func = func;
- idx = srcu_read_lock(ssp);
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
- spin_lock_rcu_node(sdp);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
- sdp->srcu_gp_seq_needed = s;
- needgp = true;
- }
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
- sdp->srcu_gp_seq_needed_exp = s;
- needexp = true;
- }
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- if (needgp)
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
- else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}
/**
@@ -919,7 +1362,9 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
- RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) ||
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
@@ -1000,6 +1445,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
* passed the same srcu_struct structure.
*
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
* If SRCU is likely idle, expedite the first request. This semantic
* was provided by Classic SRCU, and is relied upon by its users, so TREE
* SRCU must also provide it. Note that detecting idleness is heuristic
@@ -1014,6 +1462,77 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
+ * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/*
* Callback function for srcu_barrier() use.
*/
@@ -1024,8 +1543,30 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+}
+
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist. but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
}
/**
@@ -1035,51 +1576,37 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
- struct srcu_data *sdp;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ int idx;
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
- /*
- * Each pass through this loop enqueues a callback, but only
- * on CPUs already having callbacks enqueued. Note that if
- * a CPU already has callbacks enqueue, it must have already
- * registered the need for a future grace period, so all we
- * need do is enqueue a callback that will use the same
- * grace period as the last callback already in the queue.
- */
- for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
- sdp->srcu_barrier_head.func = srcu_barrier_cb;
- debug_rcu_head_queue(&sdp->srcu_barrier_head);
- if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head)) {
- debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
- }
- spin_unlock_irq_rcu_node(sdp);
- }
+ idx = __srcu_read_lock_nmisafe(ssp);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ __srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1105,7 +1632,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1117,38 +1644,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1156,9 +1684,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1171,6 +1700,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
static void srcu_invoke_callbacks(struct work_struct *work)
{
+ long len;
bool more;
struct rcu_cblist ready_cbs;
struct rcu_head *rhp;
@@ -1182,8 +1712,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
+ WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Although this function is theoretically re-entrant, concurrent
+ * callbacks invocation is disallowed to avoid executing an SRCU barrier
+ * too early.
+ */
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1193,26 +1729,28 @@ static void srcu_invoke_callbacks(struct work_struct *work)
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ len = ready_cbs.len;
spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
}
+ WARN_ON_ONCE(ready_cbs.len);
/*
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
spin_lock_irq_rcu_node(sdp);
- rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
+ /* An SRCU barrier or callbacks from previous nesting work pending */
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1225,20 +1763,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1246,12 +1784,30 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
+ unsigned long curdelay;
+ unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
- srcu_reschedule(ssp, srcu_get_delay(ssp));
+ curdelay = srcu_get_delay(ssp);
+ if (curdelay) {
+ WRITE_ONCE(sup->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -1261,47 +1817,73 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
+ int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *sdp;
-
- sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %c)",
- cpu, c0, c1,
- "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
- s0 += c0;
- s1 += c1;
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
+ u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
+ l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
}
- pr_cont(" T(%ld,%ld)\n", s0, s1);
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
@@ -1310,21 +1892,44 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
+ /*
+ * Once that is set, call_srcu() can follow the normal path and
+ * queue delayed work. This must follow RCU workqueues creation
+ * and timers initialization.
+ */
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- check_init_srcu_struct(ssp);
- list_del_init(&ssp->work.work.entry);
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1334,13 +1939,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1349,10 +1955,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index d4558ab7a07d..e550f97779b8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -44,7 +44,7 @@ static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
{
- call_rcu(&rsp->cb_head, rcu_sync_func);
+ call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
}
/**
@@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp)
rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * We're at least a GP after the last rcu_sync_exit(); everybody
* will now have observed the write side critical section.
- * Let 'em rip!.
+ * Let 'em rip!
*/
WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
@@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
* a slowpath during the update. After this function returns, all
* subsequent calls to rcu_sync_is_idle() will return false, which
* tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
+ * rcu_sync_exit() re-enables reader fastpaths.
*
* When called in isolation, rcu_sync_enter() must wait for a grace
* period, however, closely spaced calls to rcu_sync_enter() can
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..732ad5b39946 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -6,6 +6,7 @@
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
////////////////////////////////////////////////////////////////////////
//
@@ -13,47 +14,90 @@
struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
-typedef void (*pregp_func_t)(void);
+typedef void (*pregp_func_t)(struct list_head *hop);
typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
typedef void (*postscan_func_t)(struct list_head *hop);
typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
/**
- * Definition for a Tasks-RCU-like mechanism.
- * @cbs_head: Head of callback list.
- * @cbs_tail: Tail pointer for callback list.
- * @cbs_wq: Wait queue allowning new callback to get kthread's attention.
- * @cbs_lock: Lock protecting callback list.
- * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @cpu: CPU number corresponding to this entry.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ struct list_head rtp_blkd_tasks;
+ int cpu;
+ struct rcu_tasks *rtpp;
+};
+
+/**
+ * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
+ * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
+ * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
+ * @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @n_gps: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
* @pregp_func: This flavor's pre-grace-period function (optional).
* @pertask_func: This flavor's per-task scan function (optional).
* @postscan_func: This flavor's post-task scan function (optional).
- * @holdout_func: This flavor's holdout-list scan function (optional).
+ * @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct rcu_head *cbs_head;
- struct rcu_head **cbs_tail;
- struct wait_queue_head cbs_wq;
- raw_spinlock_t cbs_lock;
+ struct rcuwait cbs_wait;
+ raw_spinlock_t cbs_gbl_lock;
+ struct mutex tasks_gp_mutex;
int gp_state;
+ int gp_sleep;
+ int init_fract;
unsigned long gp_jiffies;
unsigned long gp_start;
- unsigned long n_gps;
+ unsigned long tasks_gp_seq;
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -61,34 +105,79 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
char *name;
char *kname;
};
-#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
-static struct rcu_tasks rt_name = \
-{ \
- .cbs_tail = &rt_name.cbs_head, \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
- .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \
- .gp_func = gp, \
- .call_func = call, \
- .name = n, \
- .kname = #rt_name, \
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
+ .name = n, \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
+
/* Avoid IPIing CPUs early in the grace period. */
-#define RCU_TASK_IPI_DELAY (HZ / 2)
+#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
module_param(rcu_task_ipi_delay, int, 0644);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
+
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -103,6 +192,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
#define RTGS_WAIT_READERS 9
#define RTGS_INVOKE_CBS 10
#define RTGS_WAIT_CBS 11
+#ifndef CONFIG_TINY_RCU
static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INIT",
"RTGS_WAIT_WAIT_CBS",
@@ -117,11 +207,14 @@ static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INVOKE_CBS",
"RTGS_WAIT_CBS",
};
+#endif /* #ifndef CONFIG_TINY_RCU */
////////////////////////////////////////////////////////////////////////
//
// Generic code.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
/* Record grace-period phase and time. */
static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
{
@@ -129,6 +222,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
rtp->gp_jiffies = jiffies;
}
+#ifndef CONFIG_TINY_RCU
/* Return state name. */
static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
{
@@ -139,48 +233,385 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
return "???";
return rcu_tasks_gp_state_names[j];
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ int lim;
+ int shift;
+
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ if (lim > nr_cpu_ids)
+ lim = nr_cpu_ids;
+ shift = ilog2(nr_cpu_ids / lim);
+ if (((nr_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ local_irq_save(flags); // serialize initialization
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ local_irq_restore(flags);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ }
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
+ data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+}
+
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
+ int chosen_cpu;
unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
+ int ideal_cpu;
+ unsigned long j;
+ bool needadjust = false;
bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
rhp->next = NULL;
rhp->func = func;
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- needwake = !rtp->cbs_head;
- WRITE_ONCE(*rtp->cbs_tail, rhp);
- rtp->cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ local_irq_save(flags);
+ rcu_read_lock();
+ ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+ chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ }
+ if (needwake)
+ rtpcp->urgent_gp = 3;
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
/* We can't create the thread unless interrupts are enabled. */
if (needwake && READ_ONCE(rtp->kthread_ptr))
- wake_up(&rtp->cbs_wq);
+ irq_work_queue(&rtpcp->rtp_irq_work);
}
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
{
- /* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
- /* Wait for the grace period. */
- wait_rcu_gp(rtp->call_func);
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
}
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
+{
+ int cpu;
+ int dequeue_limit;
+ unsigned long flags;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+ for (cpu = 0; cpu < dequeue_limit; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
+ needgpcb |= 0x3;
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
{
+ int cpu;
+ int cpunext;
+ int cpuwq;
unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
+ int len;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ cpu = rtpcp->cpu;
+ cpunext = cpu * 2 + 1;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ cpunext++;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_callback(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
+{
+ int needgpcb;
+
+ mutex_lock(&rtp->tasks_gp_mutex);
+
+ // If there were none, wait a bit and start over.
+ if (unlikely(midboot)) {
+ needgpcb = 0x2;
+ } else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
+ }
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
+ }
+
+ // Invoke callbacks.
+ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+ mutex_unlock(&rtp->tasks_gp_mutex);
+}
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ int cpu;
struct rcu_tasks *rtp = arg;
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_FLAG_RCU);
- WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
+ housekeeping_affine(current, HK_TYPE_RCU);
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
/*
* Each pass through the following loop makes one check for
@@ -189,51 +620,32 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
+ // Wait for one grace period and invoke any callbacks
+ // that are ready.
+ rcu_tasks_one_gp(rtp, false);
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- smp_mb__after_spinlock(); // Order updates vs. GP.
- list = rtp->cbs_head;
- rtp->cbs_head = NULL;
- rtp->cbs_tail = &rtp->cbs_head;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
-
- /* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rtp->cbs_wq,
- READ_ONCE(rtp->cbs_head));
- if (!rtp->cbs_head) {
- WARN_ON(signal_pending(current));
- set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_interruptible(HZ/10);
- }
- continue;
- }
+ // Paranoid sleep to keep this from entering a tight loop.
+ schedule_timeout_idle(rtp->gp_sleep);
+ }
+}
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rtp->gp_func(rtp);
- rtp->n_gps++;
-
- /* Invoke the callbacks. */
- set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
- /* Paranoid sleep to keep this from entering a tight loop */
- schedule_timeout_uninterruptible(HZ/10);
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+ /* Complain if the scheduler has not started. */
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
- set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ // If the grace-period kthread is running, use it.
+ if (READ_ONCE(rtp->kthread_ptr)) {
+ wait_rcu_gp(rtp->call_func);
+ return;
}
+ rcu_tasks_one_gp(rtp, true);
}
-/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */
+/* Spawn RCU-tasks grace-period kthread. */
static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
{
struct task_struct *t;
@@ -252,8 +664,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
static void __init rcu_tasks_bootup_oddness(void)
{
#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+ int rtsimc;
+
if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+ if (rtsimc != rcu_task_stall_info_mult) {
+ pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+ rcu_task_stall_info_mult = rtsimc;
+ }
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RCU
pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
@@ -268,19 +687,41 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
- pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
+ int cpu;
+ bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
+ havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
+ break;
+ }
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
- data_race(rtp->n_gps),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!!data_race(rtp->cbs_head)],
+ ".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
s);
}
+#endif // #ifndef CONFIG_TINY_RCU
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -293,13 +734,18 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t);
/* Wait for one RCU-tasks grace period. */
static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
{
- struct task_struct *g, *t;
- unsigned long lastreport;
- LIST_HEAD(holdouts);
+ struct task_struct *g;
int fract;
+ LIST_HEAD(holdouts);
+ unsigned long j;
+ unsigned long lastinfo;
+ unsigned long lastreport;
+ bool reported = false;
+ int rtsi;
+ struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
- rtp->pregp_func();
+ rtp->pregp_func(&holdouts);
/*
* There were callbacks, so we need to wait for an RCU-tasks
@@ -308,10 +754,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* and make a list of them in holdouts.
*/
set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
- rcu_read_lock();
- for_each_process_thread(g, t)
- rtp->pertask_func(t, &holdouts);
- rcu_read_unlock();
+ if (rtp->pertask_func) {
+ rcu_read_lock();
+ for_each_process_thread(g, t)
+ rtp->pertask_func(t, &holdouts);
+ rcu_read_unlock();
+ }
set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
rtp->postscan_func(&holdouts);
@@ -322,33 +770,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* is empty, we are done.
*/
lastreport = jiffies;
+ lastinfo = lastreport;
+ rtsi = READ_ONCE(rcu_task_stall_info);
- /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */
- fract = 10;
+ // Start off with initial wait and slowly back off to 1 HZ wait.
+ fract = rtp->init_fract;
- for (;;) {
+ while (!list_empty(&holdouts)) {
+ ktime_t exp;
bool firstreport;
bool needreport;
int rtst;
- if (list_empty(&holdouts))
- break;
-
- /* Slowly back off waiting for holdouts */
+ // Slowly back off waiting for holdouts
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_interruptible(HZ/fract);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ schedule_timeout_idle(fract);
+ } else {
+ exp = jiffies_to_nsecs(fract);
+ __set_current_state(TASK_IDLE);
+ schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+ }
- if (fract > 1)
- fract--;
+ if (fract < HZ)
+ fract++;
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
- if (needreport)
+ if (needreport) {
lastreport = jiffies;
+ reported = true;
+ }
firstreport = true;
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+ // Print pre-stall informational messages if needed.
+ j = jiffies;
+ if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+ lastinfo = j;
+ rtsi = rtsi * rcu_task_stall_info_mult;
+ pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n",
+ __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+ }
}
set_tasks_gp_state(rtp, RTGS_POST_GP);
@@ -362,7 +827,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
////////////////////////////////////////////////////////////////////////
//
// Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
// As such, grace periods can take one good long time. There are no
// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
// because this implementation is intended to get the system into a safe
@@ -370,9 +835,49 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+// Invokes synchronize_rcu() in order to wait for all in-flight
+// t->on_rq and t->nvcsw transitions to complete. This works because
+// all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+// For every runnable non-idle task other than the current one, use
+// get_task_struct() to pin down that task, snapshot that task's
+// number of voluntary context switches, and add that task to the
+// holdout list.
+// rcu_tasks_postscan():
+// Invoke synchronize_srcu() to ensure that all tasks that were
+// in the process of exiting (and which thus might not know to
+// synchronize with this RCU Tasks grace period) have completed
+// exiting.
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+// Invokes synchronize_rcu() in order to ensure that all prior
+// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+// to have happened before the end of this RCU Tasks grace period.
+// Again, this works because all such transitions are carried out
+// with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+// read-side critical sections waited for by rcu_tasks_postscan().
+//
+// Pre-grace-period update-side code is ordered before the grace
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
/* Pre-grace-period preparation. */
-static void rcu_tasks_pregp_step(void)
+static void rcu_tasks_pregp_step(struct list_head *hop)
{
/*
* Wait for all pre-existing t->on_rq and t->nvcsw transitions
@@ -390,10 +895,36 @@ static void rcu_tasks_pregp_step(void)
synchronize_rcu();
}
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+ int cpu;
+
+ /* Has the task been seen voluntarily sleeping? */
+ if (!READ_ONCE(t->on_rq))
+ return false;
+
+ /*
+ * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+ * quiescent states. But CPU boot code performed by the idle task
+ * isn't a quiescent state.
+ */
+ if (is_idle_task(t))
+ return false;
+
+ cpu = task_cpu(t);
+
+ /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+ if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+ return false;
+
+ return true;
+}
+
/* Per-task initial processing. */
static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
{
- if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
+ if (t != current && rcu_tasks_is_holdout(t)) {
get_task_struct(t);
t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
WRITE_ONCE(t->rcu_tasks_holdout, true);
@@ -402,16 +933,36 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
/* Processing between scanning taskslist and draining the holdout list. */
-void rcu_tasks_postscan(struct list_head *hop)
+static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
- * Wait for tasks that are in the process of exiting. This
- * does only part of the job, ensuring that all tasks that were
- * previously exiting reach the point where they have disabled
- * preemption, allowing the later synchronize_rcu() to finish
- * the job.
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+ * 1) An _SRCU_ read side starting before calling exit_notify(),
+ * which may remove the task from the tasklist, and ending after
+ * the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -422,9 +973,9 @@ static void check_holdout_task(struct task_struct *t,
if (!READ_ONCE(t->rcu_tasks_holdout) ||
t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
- !READ_ONCE(t->on_rq) ||
+ !rcu_tasks_is_holdout(t) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
- !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
@@ -442,7 +993,7 @@ static void check_holdout_task(struct task_struct *t,
t, ".I"[is_idle_task(t)],
"N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
- t->rcu_tasks_idle_cpu, cpu);
+ data_race(t->rcu_tasks_idle_cpu), cpu);
sched_show_task(t);
}
@@ -476,7 +1027,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after the synchronize_srcu() above.
+ * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
*/
synchronize_rcu();
}
@@ -484,6 +1038,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
* call_rcu_tasks() - Queue an RCU for invocation task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
@@ -493,11 +1062,11 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks() assumes
* that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
- * through a safe state, not so much for data-strcuture synchronization.
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -540,13 +1109,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
*/
void rcu_barrier_tasks(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ rcu_barrier_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+static int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_kthread(void)
{
+ cblist_init_generic(&rcu_tasks);
+ rcu_tasks.gp_sleep = HZ / 10;
+ rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -555,35 +1131,57 @@ static int __init rcu_spawn_tasks_kthread(void)
rcu_spawn_tasks_kthread_generic(&rcu_tasks);
return 0;
}
-core_initcall(rcu_spawn_tasks_kthread);
-static void show_rcu_tasks_classic_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_classic_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
- preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
- preempt_enable();
}
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
{
struct task_struct *t = current;
- preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
- preempt_enable();
- exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ exit_tasks_rcu_stop();
+ exit_tasks_rcu_finish_trace(current);
}
#else /* #ifdef CONFIG_TASKS_RCU */
-static void show_rcu_tasks_classic_gp_kthread(void) { }
void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -593,10 +1191,15 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching
-// of concurrent calls to the synchronous synchronize_rcu_rude() API.
-// This sends IPIs far and wide and induces otherwise unnecessary context
-// switches on all online CPUs, whether idle or not.
+// provides an asynchronous call_rcu_tasks_rude() API and batching of
+// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
+// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
+// and induces otherwise unnecessary context switches on all online CPUs,
+// whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
@@ -623,11 +1226,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks_rude()
* assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -645,8 +1248,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -670,26 +1273,38 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
*/
void rcu_barrier_tasks_rude(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_rude();
+ rcu_barrier_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
+int rcu_tasks_rude_lazy_ms = -1;
+module_param(rcu_tasks_rude_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_rude_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_rude);
+ rcu_tasks_rude.gp_sleep = HZ / 10;
+ if (rcu_tasks_rude_lazy_ms >= 0)
+ rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
-core_initcall(rcu_spawn_tasks_rude_kthread);
-static void show_rcu_tasks_rude_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_rude_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
-#else /* #ifdef CONFIG_TASKS_RUDE_RCU */
-static void show_rcu_tasks_rude_gp_kthread(void) {}
-#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */
+#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
//
@@ -703,19 +1318,49 @@ static void show_rcu_tasks_rude_gp_kthread(void) {}
// 2. Protects code in the idle loop, exception entry/exit, and
// CPU-hotplug code paths, similar to the capabilities of SRCU.
//
-// 3. Avoids expensive read-side instruction, having overhead similar
+// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
-// There are of course downsides. The grace-period code can send IPIs to
-// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
-// It is necessary to scan the full tasklist, much as for Tasks RCU. There
-// is a single callback queue guarded by a single lock, again, much as for
-// Tasks RCU. If needed, these downsides can be at least partially remedied.
+// There are of course downsides. For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace. If needed, these downsides can be at least
+// partially remedied.
//
// Perhaps most important, this variant of RCU does not affect the vanilla
// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
// readers can operate from idle, offline, and exception entry/exit in no
// way allows rcu_preempt and rcu_sched readers to also do so.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_trace_pregp_step():
+// Disables CPU hotplug, adds all currently executing tasks to the
+// holdout list, then checks the state of all tasks that blocked
+// or were preempted within their current RCU Tasks Trace read-side
+// critical section, adding them to the holdout list if appropriate.
+// Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
+// rcu_tasks_trace_postscan():
+// Invokes synchronize_rcu() to wait for late-stage exiting tasks
+// to finish exiting.
+// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list. Once this
+// list is empty, the grace period has completed.
+// rcu_tasks_trace_postgp():
+// Provides the needed full memory barrier and does debug checks.
+//
+// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
+//
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
// The lockdep state must be outside of #ifdef to be useful.
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -727,55 +1372,112 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
-
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
// The number of detections of task quiescent state relying on
// heavyweight readers executing explicit memory barriers.
-unsigned long n_heavy_reader_attempts;
-unsigned long n_heavy_reader_updates;
-unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_heavy_reader_attempts;
+static unsigned long n_heavy_reader_updates;
+static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
"RCU Tasks Trace");
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+ smp_mb(); // Enforce full grace-period ordering.
+ return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+ smp_store_release(&t->trc_reader_special.b.need_qs, v);
+ smp_mb(); // Enforce full grace-period ordering.
+}
+
/*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
*/
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- wake_up(&trc_wait);
+ union rcu_special ret;
+ union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+ union rcu_special trs_new = trs_old;
+
+ if (trs_old.b.need_qs != old)
+ return trs_old.b.need_qs;
+ trs_new.b.need_qs = new;
+ ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+ return ret.b.need_qs;
}
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-/* If we are the last reader, wake up the grace-period kthread. */
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
+void rcu_read_unlock_trace_special(struct task_struct *t)
{
- int nq = t->trc_reader_special.b.need_qs;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ union rcu_special trs;
+
+ // Open-coded full-word version of rcu_ld_need_qs().
+ smp_mb(); // Enforce full grace-period ordering.
+ trs = smp_load_acquire(&t->trc_reader_special);
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
- t->trc_reader_special.b.need_mb)
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
smp_mb(); // Pairs with update-side barriers.
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (nq)
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_nesting, nesting);
- if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
- irq_work_queue(&rcu_tasks_trace_iw);
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+ u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+ TRC_NEED_QS_CHECKED);
+
+ WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+ }
+ if (trs.b.blocked) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->trc_blkd_node);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+
+ local_irq_save(flags);
+ rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+ t->trc_blkd_cpu = smp_processor_id();
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
/* Add a task to the holdout list, if it is not already on the list. */
static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
{
if (list_empty(&t->trc_holdout_list)) {
get_task_struct(t);
list_add(&t->trc_holdout_list, bhp);
+ n_trc_holdouts++;
}
}
@@ -785,92 +1487,91 @@ static void trc_del_holdout(struct task_struct *t)
if (!list_empty(&t->trc_holdout_list)) {
list_del_init(&t->trc_holdout_list);
put_task_struct(t);
+ n_trc_holdouts--;
}
}
/* IPI handler to check task state. */
static void trc_read_check_handler(void *t_in)
{
+ int nesting;
struct task_struct *t = current;
struct task_struct *texp = t_in;
// If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(texp != t))
goto reset_ipi; // Already on holdout list, so will check later.
- }
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
- // Mark as checked after decrement to avoid false
- // positives on the above WARN_ON_ONCE().
- WRITE_ONCE(t->trc_reader_checked, true);
+ nesting = READ_ONCE(t->trc_reader_nesting);
+ if (likely(!nesting)) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
goto reset_ipi;
}
- WRITE_ONCE(t->trc_reader_checked, true);
+ // If we are racing with an rcu_read_unlock_trace(), try again later.
+ if (unlikely(nesting < 0))
+ goto reset_ipi;
- // Get here if the task is in a read-side critical section. Set
- // its state so that it will awaken the grace-period kthread upon
- // exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+ // Get here if the task is in a read-side critical section.
+ // Set its state so that it will update state for the grace-period
+ // kthread upon exit from that critical section.
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
// Also order this IPI handler against any later manipulations of
// the intended task.
- smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
}
/* Callback function for scheduler to check locked-down task. */
-static bool trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
{
+ struct list_head *bhp = bhp_in;
int cpu = task_cpu(t);
- bool in_qs = false;
+ int nesting;
bool ofl = cpu_is_offline(cpu);
- if (task_curr(t)) {
- WARN_ON_ONCE(ofl & !is_idle_task(t));
-
+ if (task_curr(t) && !ofl) {
// If no chance of heavyweight readers, do it the hard way.
- if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return false;
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
- if (!ofl && // Check for "running" idle tasks on offline CPUs.
- !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return false; // No quiescent state, do it the hard way.
+ // Check for "running" idle tasks on offline CPUs.
+ if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
- if (ofl)
- n_heavy_reader_ofl_updates++;
- in_qs = true;
+ nesting = 0;
} else {
- in_qs = likely(!t->trc_reader_nesting);
+ // The task is not running, so C-language access is safe.
+ nesting = t->trc_reader_nesting;
+ WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+ n_heavy_reader_ofl_updates++;
}
- // Mark as checked. Because this is called from the grace-period
- // kthread, also remove the task from the holdout list.
- t->trc_reader_checked = true;
- trc_del_holdout(t);
-
- if (in_qs)
- return true; // Already in quiescent state, done!!!
+ // If not exiting a read-side critical section, mark as checked
+ // so that the grace-period kthread will remove it from the
+ // holdout list.
+ if (!nesting) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ return 0; // In QS, so done.
+ }
+ if (nesting < 0)
+ return -EINVAL; // Reader transitioning, try again later.
// The task is in a read-side critical section, so set up its
- // state so that it will awaken the grace-period kthread upon exit
- // from that critical section.
- atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
- return true;
+ // state so that it will update state upon exit from that critical
+ // section.
+ if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+ trc_add_holdout(t, bhp);
+ return 0;
}
/* Attempt to extract the state for the specified task. */
@@ -885,23 +1586,29 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
- t->trc_reader_checked = true;
- trc_del_holdout(t);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
// Attempt to nail down the task for inspection.
get_task_struct(t);
- if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) {
+ if (!task_call_func(t, trc_inspect_reader, bhp)) {
put_task_struct(t);
return;
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // trc_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
- if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
+ if (task_curr(t) &&
+ time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
// The task is currently running, so try IPIing it.
cpu = task_cpu(t);
@@ -909,91 +1616,161 @@ static void trc_wait_for_one_reader(struct task_struct *t,
if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
return;
- atomic_inc(&trc_n_readers_need_end);
per_cpu(trc_ipi_to_cpu, cpu) = true;
t->trc_ipi_to_cpu = cpu;
rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu,
- trc_read_check_handler, t, 0)) {
+ if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
// Just in case there is some other reason for
// failure than the target CPU being offline.
+ WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+ __func__, cpu);
rcu_tasks_trace.n_ipis_fails++;
per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = cpu;
- if (atomic_dec_and_test(&trc_n_readers_need_end)) {
- WARN_ON_ONCE(1);
- wake_up(&trc_wait);
- }
+ t->trc_ipi_to_cpu = -1;
}
}
}
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Also, the grace-period
+ // kthread is always in a quiescent state. In addition, just return
+ // if this task is already on the list.
+ if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+ return false;
+
+ rcu_st_need_qs(t, 0);
+ t->trc_ipi_to_cpu = -1;
+ return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_wait_for_one_reader(t, hop);
+}
+
/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(void)
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
{
+ LIST_HEAD(blkd_tasks);
int cpu;
-
- // Allow for fast-acting IPIs.
- atomic_set(&trc_n_readers_need_end, 1);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t;
// There shouldn't be any old IPIs, but...
for_each_possible_cpu(cpu)
WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
- // Disable CPU hotplug across the tasklist scan.
- // This also waits for all readers in CPU-hotplug code paths.
+ // Disable CPU hotplug across the CPU scan for the benefit of
+ // any IPIs that might be needed. This also waits for all readers
+ // in CPU-hotplug code paths.
cpus_read_lock();
-}
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t,
- struct list_head *hop)
-{
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_checked, false);
- t->trc_ipi_to_cpu = -1;
- trc_wait_for_one_reader(t, hop);
+ // These rcu_tasks_trace_pertask_prep() calls are serialized to
+ // allow safe access to the hop list.
+ for_each_online_cpu(cpu) {
+ rcu_read_lock();
+ t = cpu_curr_snapshot(cpu);
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_add_holdout(t, hop);
+ rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Only after all running tasks have been accounted for is it
+ // safe to take care of the tasks that have blocked within their
+ // current RCU tasks trace read-side critical section.
+ for_each_possible_cpu(cpu) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+ while (!list_empty(&blkd_tasks)) {
+ rcu_read_lock();
+ t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+ list_del_init(&t->trc_blkd_node);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ rcu_tasks_trace_pertask(t, hop);
+ rcu_read_unlock();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Re-enable CPU hotplug now that the holdout list is populated.
+ cpus_read_unlock();
}
/*
- * Do intermediate processing between task and holdout scans and
- * pick up the idle tasks.
+ * Do intermediate processing between task and holdout scans.
*/
static void rcu_tasks_trace_postscan(struct list_head *hop)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- rcu_tasks_trace_pertask(idle_task(cpu), hop);
-
- // Re-enable CPU hotplug now that the tasklist scan has completed.
- cpus_read_unlock();
-
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
synchronize_rcu();
- // Any tasks that exit after this point will set ->trc_reader_checked.
+ // Any tasks that exit after this point will set
+ // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
+}
+
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t) && cpu_online(task_cpu(t)))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = rcu_ld_need_qs(t);
+ return true;
}
/* Show the state of a task stalling the current RCU tasks trace GP. */
static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
{
int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
if (*firstreport) {
pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
*firstreport = false;
}
- // FIXME: This should attempt to use try_invoke_on_nonrunning_task().
cpu = task_cpu(t);
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
- t->pid,
- ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
- ".i"[is_idle_task(t)],
- ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
- cpu);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c%c\n",
+ t->pid,
+ ".I"[t->trc_ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ ".B"[!!data_race(t->trc_reader_special.b.blocked)],
+ trc_rdr.nesting,
+ " !CN"[trc_rdr.needqs & 0x3],
+ " ?"[trc_rdr.needqs > 0x3],
+ cpu, cpu_online(cpu) ? "" : "(offline)");
sched_show_task(t);
}
@@ -1013,69 +1790,52 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
{
struct task_struct *g, *t;
- // Disable CPU hotplug across the holdout list scan.
+ // Disable CPU hotplug across the holdout list scan for IPIs.
cpus_read_lock();
list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
// If safe and needed, try to check the current task.
if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !READ_ONCE(t->trc_reader_checked))
+ !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
- if (READ_ONCE(t->trc_reader_checked))
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list scan has completed.
cpus_read_unlock();
if (needreport) {
- if (firstreport)
+ if (*firstreport)
pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
show_stalled_ipi_trace();
}
}
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
/* Wait for grace period to complete and provide ordering. */
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
- bool firstreport;
- struct task_struct *g, *t;
- LIST_HEAD(holdouts);
- long ret;
+ int cpu;
- // Remove the safety count.
- smp_mb__before_atomic(); // Order vs. earlier atomics
- atomic_dec(&trc_n_readers_need_end);
- smp_mb__after_atomic(); // Order vs. later atomics
+ // Wait for any lingering IPI handlers to complete. Note that
+ // if a CPU has gone offline or transitioned to userspace in the
+ // meantime, all IPI handlers should have been drained beforehand.
+ // Yes, this assumes that CPUs process IPIs in order. If that ever
+ // changes, there will need to be a recheck and/or timed wait.
+ for_each_online_cpu(cpu)
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
+ smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
- // Wait for readers.
- set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
- for (;;) {
- ret = wait_event_idle_exclusive_timeout(
- trc_wait,
- atomic_read(&trc_n_readers_need_end) == 0,
- READ_ONCE(rcu_task_stall_timeout));
- if (ret)
- break; // Count reached zero.
- // Stall warning time, so make a list of the offenders.
- for_each_process_thread(g, t)
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- trc_add_holdout(t, &holdouts);
- firstreport = true;
- list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list)
- if (READ_ONCE(t->trc_reader_special.b.need_qs)) {
- show_stalled_task_trace(t, &firstreport);
- trc_del_holdout(t);
- }
- if (firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
- show_stalled_ipi_trace();
- pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
- }
smp_mb(); // Caller's code must be ordered after wakeup.
// Pairs with pretty much every ordering primitive.
}
@@ -1083,11 +1843,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
/* Report any needed quiescent state for this exiting task. */
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
- WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
- WRITE_ONCE(t->trc_reader_nesting, 0);
- if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
- rcu_read_unlock_trace_special(t, 0);
+ union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+ if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
+ rcu_read_unlock_trace_special(t);
+ else
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
/**
@@ -1095,15 +1858,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks_trace()
- * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1118,11 +1877,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
* synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
*
* Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed. These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * grace period has elapsed, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -1147,47 +1905,205 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
*/
void rcu_barrier_tasks_trace(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_trace();
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_trace_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_trace);
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
+ rcu_tasks_trace.gp_sleep = HZ / 10;
+ rcu_tasks_trace.init_fract = HZ / 10;
+ } else {
+ rcu_tasks_trace.gp_sleep = HZ / 200;
+ if (rcu_tasks_trace.gp_sleep <= 0)
+ rcu_tasks_trace.gp_sleep = 1;
+ rcu_tasks_trace.init_fract = HZ / 200;
+ if (rcu_tasks_trace.init_fract <= 0)
+ rcu_tasks_trace.init_fract = 1;
+ }
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace);
return 0;
}
-core_initcall(rcu_spawn_tasks_trace_kthread);
-static void show_rcu_tasks_trace_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+ sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
data_race(n_heavy_reader_attempts));
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
-static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
+#ifndef CONFIG_TINY_RCU
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
show_rcu_tasks_trace_gp_kthread();
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+struct rcu_tasks_test_desc {
+ struct rcu_head rh;
+ const char *name;
+ bool notrun;
+ unsigned long runstart;
+};
+
+static struct rcu_tasks_test_desc tests[] = {
+ {
+ .name = "call_rcu_tasks()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_rude()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_trace()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ }
+};
+
+static void test_rcu_tasks_callback(struct rcu_head *rhp)
+{
+ struct rcu_tasks_test_desc *rttd =
+ container_of(rhp, struct rcu_tasks_test_desc, rh);
+
+ pr_info("Callback from %s invoked.\n", rttd->name);
+
+ rttd->notrun = false;
+}
+
+static void rcu_tasks_initiate_self_tests(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ pr_info("Running RCU Tasks wait API self tests\n");
+ tests[0].runstart = jiffies;
+ synchronize_rcu_tasks();
+ call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("Running RCU Tasks Rude wait API self tests\n");
+ tests[1].runstart = jiffies;
+ synchronize_rcu_tasks_rude();
+ call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("Running RCU Tasks Trace wait API self tests\n");
+ tests[2].runstart = jiffies;
+ synchronize_rcu_tasks_trace();
+ call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+#endif
+}
+
+/*
+ * Return: 0 - test passed
+ * 1 - test failed, but have not timed out yet
+ * -1 - test failed and timed out
+ */
+static int rcu_tasks_verify_self_tests(void)
+{
+ int ret = 0;
+ int i;
+ unsigned long bst = rcu_task_stall_timeout;
+
+ if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+ bst = RCU_TASK_BOOT_STALL_TIMEOUT;
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ while (tests[i].notrun) { // still hanging.
+ if (time_after(jiffies, tests[i].runstart + bst)) {
+ pr_err("%s has failed boot-time tests.\n", tests[i].name);
+ ret = -1;
+ break;
+ }
+ ret = 1;
+ break;
+ }
+ }
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+ int ret = rcu_tasks_verify_self_tests();
+
+ if (ret <= 0)
+ return;
+
+ /* Test fails but not timed out yet, reschedule another check */
+ schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+ INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+ rcu_tasks_verify_work_fn(NULL);
+ return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
+#else /* #ifdef CONFIG_PROVE_RCU */
+static void rcu_tasks_initiate_self_tests(void) { }
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+void __init rcu_init_tasks_generic(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ rcu_spawn_tasks_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ rcu_spawn_tasks_rude_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ rcu_spawn_tasks_trace_kthread();
+#endif
+
+ // Run the self-tests.
+ rcu_tasks_initiate_self_tests();
+}
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
-void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index dd572ce7c747..fec804b79080 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/prefetch.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "rcu.h"
@@ -31,17 +32,19 @@ struct rcu_ctrlblk {
struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
+ unsigned long gp_seq; /* Grace-period counter. */
};
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.donetail = &rcu_ctrlblk.rcucblist,
.curtail = &rcu_ctrlblk.rcucblist,
+ .gp_seq = 0 - 300UL,
};
void rcu_barrier(void)
{
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
}
EXPORT_SYMBOL(rcu_barrier);
@@ -55,6 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -84,15 +88,16 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- trace_rcu_invoke_kfree_callback("", head, offset);
- kfree((void *)head - offset);
+ if (__is_kvfree_rcu_offset(offset)) {
+ trace_rcu_invoke_kvfree_callback("", head, offset);
+ kvfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
}
trace_rcu_invoke_callback("", head);
f = head->func;
+ debug_rcu_head_callback(head);
WRITE_ONCE(head->func, (rcu_callback_t)0L);
f(head);
rcu_lock_release(&rcu_callback_map);
@@ -135,8 +140,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_rcu() is a quiescent
- * state, and so on a UP system, synchronize_rcu() need do nothing.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
+ *
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@@ -148,9 +155,14 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -158,9 +170,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+
+ if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+ WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+ return;
+ }
+
head->func = func;
head->next = NULL;
@@ -176,9 +199,66 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu);
+/*
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/*
+ * Return a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ return READ_ONCE(rcu_ctrlblk.gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/*
+ * Return a grace-period-counter "cookie" and ensure that a future grace
+ * period completes. For more information, see the Tree RCU header comment.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_qs() */
+ resched_cpu(0);
+ }
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/*
+ * Return true if the grace period corresponding to oldstate has completed
+ * and false otherwise. For more information, see the Tree RCU header
+ * comment.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head)
+ kasan_record_aux_stack_noalloc(ptr);
+
+ __kvfree_call_rcu(head, ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
- srcu_init();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..b2bccfd37c38 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -31,7 +31,10 @@
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
+#include <linux/kmemleak.h>
#include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -57,6 +60,10 @@
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/kasan.h>
+#include <linux/context_tracking.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -67,51 +74,35 @@
#endif
#define MODULE_PARAM_PREFIX "rcutree."
-#ifndef data_race
-#define data_race(expr) \
- ({ \
- expr; \
- })
-#endif
-#ifndef ASSERT_EXCLUSIVE_WRITER
-#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
-#endif
-#ifndef ASSERT_EXCLUSIVE_ACCESS
-#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
-#endif
-
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
- .dynticks_nesting = 1,
- .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .gpwrap = true,
+#ifdef CONFIG_RCU_NOCB_CPU
+ .cblist.flags = SEGCBLIST_RCU_CORE,
+#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
.name = RCU_NAME,
.abbr = RCU_ABBR,
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
- .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -154,15 +145,22 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-/* rcuc/rcub kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
+ * real-time priority(enabling/disabling) is controlled by
+ * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
+ */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -175,6 +173,32 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+// Add delay to rcu_read_unlock() for strict grace periods.
+static int rcu_unlock_delay;
+#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
+module_param(rcu_unlock_delay, int, 0444);
+#endif
+
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 5;
+module_param(rcu_min_cached_objs, int, 0444);
+
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -191,18 +215,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
-#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
-
-/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->qsmaskinitnext);
-}
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
@@ -231,58 +244,7 @@ void rcu_softirq_qs(void)
{
rcu_qs();
rcu_preempt_deferred_qs(current);
-}
-
-/*
- * Record entry into an extended quiescent state. This is only to be
- * called when not already in an extended quiescent state, that is,
- * RCU is watching prior to the call to this function and is no longer
- * watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_enter(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior RCU read-side
- * critical sections, and we also must force ordering with the
- * next idle sojourn.
- */
- rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
-}
-
-/*
- * Record exit from an extended quiescent state. This is only to be
- * called from an extended quiescent state, that is, RCU is not watching
- * prior to the call to this function and is watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_exit(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior idle sojourns,
- * and we also must force ordering with the next RCU read-side
- * critical section.
- */
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is now watching. Better not be in an extended quiescent state!
- rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
+ rcu_tasks_qs(current, false);
}
/*
@@ -297,34 +259,19 @@ static noinstr void rcu_dynticks_eqs_exit(void)
*/
static void rcu_dynticks_eqs_online(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (ct_dynticks() & RCU_DYNTICKS_IDX)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
-}
-
-/*
- * Is the current CPU in an extended quiescent state?
- *
- * No ordering, as we are sampling CPU-local information.
- */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ ct_state_inc(RCU_DYNTICKS_IDX);
}
/*
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-static int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(int cpu)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return ct_dynticks_cpu_acquire(cpu);
}
/*
@@ -333,7 +280,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
+ return !(snap & RCU_DYNTICKS_IDX);
}
/*
@@ -343,7 +290,7 @@ static bool rcu_dynticks_in_eqs(int snap)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp);
+ return snap != rcu_dynticks_snap(rdp->cpu);
}
/*
@@ -352,45 +299,17 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
*/
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
-
+ snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == ct_dynticks_cpu(cpu);
}
/*
@@ -404,15 +323,14 @@ bool rcu_eqs_special_set(int cpu)
*
* The caller must have disabled interrupts and must not be idle.
*/
-void rcu_momentary_dyntick_idle(void)
+notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -437,13 +355,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
"RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
"RCU dynticks_nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+ nesting = ct_dynticks_nmi_nesting();
if (nesting > 1)
return false;
@@ -453,27 +371,28 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
+ return ct_dynticks_nesting() == 0;
}
-#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
-#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
+#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
+ // Maximum callbacks per rcu_do_batch ...
+#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
static long blimit = DEFAULT_RCU_BLIMIT;
-#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
static long qhimark = DEFAULT_RCU_QHIMARK;
-#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
static long qlowmark = DEFAULT_RCU_QLOMARK;
#define DEFAULT_RCU_QOVLD_MULT 2
#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
-static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
-static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
+static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
+static long qovld_calc = -1; // No pre-initialization lock acquisitions!
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
module_param(qovld, long, 0444);
-static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
static int rcu_divisor = 7;
@@ -540,12 +459,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param
return ret;
}
-static struct kernel_param_ops first_fqs_jiffies_ops = {
+static const struct kernel_param_ops first_fqs_jiffies_ops = {
.set = param_set_first_fqs_jiffies,
.get = param_get_ulong,
};
-static struct kernel_param_ops next_fqs_jiffies_ops = {
+static const struct kernel_param_ops next_fqs_jiffies_ops = {
.set = param_set_next_fqs_jiffies,
.get = param_get_ulong,
};
@@ -603,186 +522,45 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
*/
-static noinstr void rcu_eqs_enter(bool user)
+static void late_wakeup_func(struct irq_work *work)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdp->dynticks_nesting == 0);
- if (rdp->dynticks_nesting != 1) {
- // RCU will still be watching, so just do accounting and leave.
- rdp->dynticks_nesting--;
- return;
- }
-
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
- rcu_prepare_for_idle();
- rcu_preempt_deferred_qs(current);
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(false);
}
-#ifdef CONFIG_NO_HZ_FULL
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace. No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(true);
-}
-#endif /* CONFIG_NO_HZ_FULL */
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func);
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+/*
+ * If either:
*
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
+ * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+ * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
*
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * In these cases the late RCU wake ups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled again.
*/
-noinstr void rcu_nmi_exit(void)
+noinstr void rcu_irq_work_resched(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- instrumentation_begin();
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
-
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
- atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
- rdp->dynticks_nmi_nesting - 2);
- instrumentation_end();
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
return;
- }
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
-
- if (!in_nmi())
- rcu_prepare_for_idle();
+ if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return;
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+ }
instrumentation_end();
-
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
-
- if (!in_nmi())
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture's idle loop violates this assumption, RCU will give you what
- * you deserve, good and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_irq_exit(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-}
-
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- * towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
- "RCU in extended quiescent state!");
}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
@@ -792,9 +570,9 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
"RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
DYNTICK_IRQ_NONIDLE,
"Bad RCU dynticks_nmi_nesting counter\n");
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -802,95 +580,8 @@ void rcu_irq_exit_check_preempt(void)
}
#endif /* #ifdef CONFIG_PROVE_RCU */
-/*
- * Wrapper for rcu_irq_exit() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_exit_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_exit();
- local_irq_restore(flags);
-}
-
-/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
- * allow for the possibility of usermode upcalls messing up our count of
- * interrupt nesting level during the busy period that is just now starting.
- */
-static void noinstr rcu_eqs_exit(bool user)
-{
- struct rcu_data *rdp;
- long oldval;
-
- lockdep_assert_irqs_disabled();
- rdp = this_cpu_ptr(&rcu_data);
- oldval = rdp->dynticks_nesting;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval) {
- // RCU was already watching, so just do accounting and leave.
- rdp->dynticks_nesting++;
- return;
- }
- rcu_dynticks_task_exit();
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
- instrumentation_begin();
-
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- WRITE_ONCE(rdp->dynticks_nesting, 1);
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
- instrumentation_end();
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_eqs_exit(false);
- local_irq_restore(flags);
-}
-
#ifdef CONFIG_NO_HZ_FULL
/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
- rcu_eqs_exit(1);
-}
-
-/**
* __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
*
* The scheduler tick is not normally enabled when CPUs enter the kernel
@@ -920,8 +611,8 @@ void __rcu_irq_enter_check_tick(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- // Enabling the tick is unsafe in NMI handlers.
- if (WARN_ON_ONCE(in_nmi()))
+ // If we're here from NMI there's nothing to do.
+ if (in_nmi())
return;
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -942,7 +633,7 @@ void __rcu_irq_enter_check_tick(void)
// prevents self-deadlock. So we can safely recheck under the lock.
// Note that the nohz_full state currently cannot change.
raw_spin_lock_rcu_node(rdp->mynode);
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
// A nohz_full CPU is in the kernel and RCU needs a
// quiescent state. Turn on the tick!
WRITE_ONCE(rdp->rcu_forced_tick, true);
@@ -950,114 +641,24 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
- *
- * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
- * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active. This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int. (You will probably
- * run out of stack space first.)
- *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_enter(void)
-{
- long incby = 2;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- /* Complain about underflow. */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
-
- /*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
- * to be in the outermost NMI handler that interrupted an RCU-idle
- * period (observation due to Andy Lutomirski).
- */
- if (rcu_dynticks_curr_cpu_in_eqs()) {
-
- if (!in_nmi())
- rcu_dynticks_task_exit();
-
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
-
- if (!in_nmi())
- rcu_cleanup_after_idle();
-
- instrumentation_begin();
- // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
- instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- incby = 1;
- } else if (!in_nmi()) {
- instrumentation_begin();
- rcu_irq_enter_check_tick();
- instrumentation_end();
- } else {
- instrumentation_begin();
- }
-
- trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
- rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
- rdp->dynticks_nmi_nesting + incby);
- barrier();
-}
-
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to user mode!
- * This code assumes that the idle loop never does upcalls to user mode.
- * If your architecture's idle loop does do upcalls to user mode (or does
- * anything else that results in unbalanced calls to the irq_enter() and
- * irq_exit() functions), RCU will give you what you deserve, good and hard.
- * But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_irq_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_enter();
-}
-
/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
*
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
*/
-void rcu_irq_enter_irqson(void)
+int rcu_needs_cpu(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
/*
@@ -1076,20 +677,22 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
}
-noinstr bool __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is not idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
+ *
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * An @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
+ *
+ * Make notrace because it can be called by the internal functions of
+ * ftrace, and making this notrace removes unnecessary recursion calls.
*/
-bool rcu_is_watching(void)
+notrace bool rcu_is_watching(void)
{
bool ret;
@@ -1118,43 +721,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- bool ret = false;
-
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable_notrace();
- rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
- ret = true;
- preempt_enable_notrace();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
-/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
@@ -1177,7 +745,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1187,16 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through an dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
- bool *rnhqp;
- bool *ruqp;
+ int ret = 0;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -1213,22 +784,35 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 1;
}
- /* If waiting too long on an offline CPU, complain. */
- if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
- time_after(jiffies, rcu_state.gp_start + HZ)) {
- bool onl;
+ /*
+ * Complain if a CPU that is considered to be offline from RCU's
+ * perspective has not yet reported a quiescent state. After all,
+ * the offline CPU should have reported a quiescent state during
+ * the CPU-offline process, or, failing that, by rcu_gp_init()
+ * if it ran concurrently with either the CPU going offline or the
+ * last task on a leaf rcu_node structure exiting its RCU read-side
+ * critical section while all CPUs corresponding to that structure
+ * are offline. This added warning detects bugs in any of these
+ * code paths.
+ *
+ * The rcu_node structure's ->lock is held here, which excludes
+ * the relevant portions the CPU-hotplug code, the grace-period
+ * initialization code, and the rcu_read_unlock() code paths.
+ *
+ * For more detail, please refer to the "Hotplug CPU" section
+ * of RCU's Requirements documentation.
+ */
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
struct rcu_node *rnp1;
- WARN_ON(1); /* Offline CPUs are supposed to report QS! */
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
__func__, rnp->grplo, rnp->grphi, rnp->level,
(long)rnp->gp_seq, (long)rnp->completedqs);
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
return 1; /* Break things loose after complaining. */
@@ -1246,17 +830,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* is set way high.
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
- ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
- if (!READ_ONCE(*rnhqp) &&
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*rnhqp, true);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
- smp_store_release(ruqp, true);
+ smp_store_release(&rdp->rcu_urgent_qs, true);
} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
}
/*
@@ -1270,9 +852,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*ruqp, true);
- resched_cpu(rdp->cpu);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
/*
@@ -1285,21 +867,37 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (time_after(jiffies, rcu_state.jiffies_resched)) {
if (time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
- init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
- atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
}
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
}
- return 0;
+ return ret;
}
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
@@ -1415,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
+static void swake_up_one_online_ipi(void *arg)
+{
+ struct swait_queue_head *wqh = arg;
+
+ swake_up_one(wqh);
+}
+
+static void swake_up_one_online(struct swait_queue_head *wqh)
+{
+ int cpu = get_cpu();
+
+ /*
+ * If called from rcutree_report_cpu_starting(), wake up
+ * is dangerous that late in the CPU-down hotplug process. The
+ * scheduler might queue an ignored hrtimer. Defer the wake up
+ * to an online CPU instead.
+ */
+ if (unlikely(cpu_is_offline(cpu))) {
+ int target;
+
+ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
+ cpu_online_mask);
+
+ smp_call_function_single(target, swake_up_one_online_ipi,
+ wqh, 0);
+ put_cpu();
+ } else {
+ put_cpu();
+ swake_up_one(wqh);
+ }
+}
+
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1434,12 +1064,12 @@ static void rcu_gp_kthread_wake(void)
{
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
- if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one(&rcu_state.gp_wq);
+ swake_up_one_online(&rcu_state.gp_wq);
}
/*
@@ -1466,6 +1096,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1482,9 +1114,12 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
- trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB"));
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
else
- trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB"));
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
@@ -1552,14 +1187,28 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
/*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
+ * quiescent state. This is intended to be invoked when the CPU notices
+ * a new grace period.
+ */
+static void rcu_strict_gp_check_qs(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
@@ -1569,8 +1218,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
bool need_qs;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_rdp_is_offloaded(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1608,6 +1256,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1629,16 +1279,43 @@ static void note_gp_changes(struct rcu_data *rdp)
}
needwake = __note_gp_changes(rnp, rdp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ rcu_strict_gp_check_qs();
if (needwake)
rcu_gp_kthread_wake();
}
+static atomic_t *rcu_gp_slow_suppress;
+
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+
+static bool rcu_gp_slow_is_suppressed(void)
+{
+ atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+
+ return rgssp && atomic_read(rgssp);
+}
+
static void rcu_gp_slow(int delay)
{
- if (delay > 0 &&
- !(rcu_seq_ctr(rcu_state.gp_seq) %
- (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
- schedule_timeout_uninterruptible(delay);
+ if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ schedule_timeout_idle(delay);
}
static unsigned long sleep_duration;
@@ -1661,15 +1338,94 @@ static void rcu_gp_torture_wait(void)
duration = xchg(&sleep_duration, 0UL);
if (duration > 0) {
pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
- schedule_timeout_uninterruptible(duration);
+ schedule_timeout_idle(duration);
pr_alert("%s: Wait complete\n", __func__);
}
}
/*
+ * Handler for on_each_cpu() to invoke the target CPU's RCU core
+ * processing.
+ */
+static void rcu_strict_gp_boundary(void *unused)
+{
+ invoke_rcu_core();
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
+
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+/*
* Initialize a new grace period. Return false if no grace period required.
*/
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
{
unsigned long flags;
unsigned long oldmask;
@@ -1701,23 +1457,30 @@ static bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
- * Apply per-leaf buffered online and offline operations to the
- * rcu_node tree. Note that this new grace period need not wait
- * for subsequent online CPUs, and that quiescent-state forcing
- * will handle subsequent offline CPUs.
+ * Apply per-leaf buffered online and offline operations to
+ * the rcu_node tree. Note that this new grace period need not
+ * wait for subsequent online CPUs, and that RCU hooks in the CPU
+ * offlining path, when combined with checks in this function,
+ * will handle CPUs that are currently going offline or that will
+ * go offline later. Please also refer to "Hotplug CPU" section
+ * of RCU's Requirements documentation.
*/
- rcu_state.gp_state = RCU_GP_ONOFF;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- raw_spin_lock(&rcu_state.ofl_lock);
- raw_spin_lock_irq_rcu_node(rnp);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
continue;
}
@@ -1752,8 +1515,9 @@ static bool rcu_gp_init(void)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -1769,7 +1533,7 @@ static bool rcu_gp_init(void)
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_state.gp_state = RCU_GP_INIT;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
rcu_for_each_node_breadth_first(rnp) {
rcu_gp_slow(gp_init_delay);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1794,6 +1558,10 @@ static bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_activity, jiffies);
}
+ // If strict, make all CPUs aware of new grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
+
return true;
}
@@ -1826,10 +1594,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
*/
static void rcu_gp_fqs(bool first_time)
{
+ int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+ WARN_ON_ONCE(nr_fqs > 3);
+ /* Only countdown nr_fqs for stall purposes if jiffies moves. */
+ if (nr_fqs) {
+ if (nr_fqs == 1) {
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
+ }
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+ }
+
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -1849,40 +1629,57 @@ static void rcu_gp_fqs(bool first_time)
/*
* Loop doing repeated quiescent-state forcing until the grace period ends.
*/
-static void rcu_gp_fqs_loop(void)
+static noinline_for_stack void rcu_gp_fqs_loop(void)
{
- bool first_gp_fqs;
+ bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
- first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
- if (!ret) {
- rcu_state.jiffies_force_qs = jiffies + j;
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * jiffies_force_qs before RCU_GP_WAIT_FQS state
+ * update; required for stall checks.
+ */
+ smp_wmb();
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
- rcu_state.gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_idle_timeout_exclusive(
- rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DOING_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
+ /*
+ * Exit the loop if the root rcu_node structure indicates that the grace period
+ * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
- (gf & RCU_GP_FLAG_FQS)) {
+ (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
@@ -1918,7 +1715,7 @@ static void rcu_gp_fqs_loop(void)
/*
* Clean up after the old grace period.
*/
-static void rcu_gp_cleanup(void)
+static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
@@ -1944,6 +1741,7 @@ static void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -1963,6 +1761,8 @@ static void rcu_gp_cleanup(void)
dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -1988,7 +1788,7 @@ static void rcu_gp_cleanup(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- rcu_state.gp_state = RCU_GP_IDLE;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@ -1997,19 +1797,37 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ offloaded = rcu_rdp_is_offloaded(rdp);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+ // We get here if a grace period was needed (“needgp”)
+ // and the above call to rcu_accelerate_cbs() did not set
+ // the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+ // the need for another grace period).  The purpose
+ // of the “offloaded” check is to avoid invoking
+ // rcu_accelerate_cbs() on an offloaded CPU because we do not
+ // hold the ->nocb_lock needed to safely access an offloaded
+ // ->cblist.  We do not want to acquire that lock because
+ // it can be heavily contended during callback floods.
+
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
- trace_rcu_grace_period(rcu_state.name,
- rcu_state.gp_seq,
- TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
} else {
- WRITE_ONCE(rcu_state.gp_flags,
- rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+
+ // We get here either if there is no need for an
+ // additional grace period or if rcu_accelerate_cbs() has
+ // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+ // So all we need to do is to clear all of the other
+ // ->gp_flags bits.
+
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
raw_spin_unlock_irq_rcu_node(rnp);
+
+ // If strict, make all CPUs aware of the end of the old grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
}
/*
@@ -2024,12 +1842,12 @@ static int __noreturn rcu_gp_kthread(void *unused)
for (;;) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rcu_state.gp_state = RCU_GP_WAIT_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DONE_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
break;
@@ -2044,9 +1862,9 @@ static int __noreturn rcu_gp_kthread(void *unused)
rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_state.gp_state = RCU_GP_CLEANUP;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
rcu_gp_cleanup();
- rcu_state.gp_state = RCU_GP_CLEANED;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
@@ -2188,15 +2006,14 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
* structure. This must be called from the specified CPU.
*/
static void
-rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
+rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ bool needacc = false;
struct rcu_node *rnp;
+ WARN_ON_ONCE(rdp->cpu != smp_processor_id());
rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
@@ -2213,23 +2030,40 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
- if (rdp->cpu == smp_processor_id())
- rdp->core_needs_qs = false;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- if (!offloaded)
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
+ } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ /*
+ * ...but NOCB kthreads may miss or delay callbacks acceleration
+ * if in the middle of a (de-)offloading process.
+ */
+ needacc = true;
+ }
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
+
+ if (needacc) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
}
}
@@ -2263,110 +2097,38 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
* Tell RCU we are done (but rcu_report_qs_rdp() will be the
* judge of that).
*/
- rcu_report_qs_rdp(rdp->cpu, rdp);
+ rcu_report_qs_rdp(rdp);
}
-/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
+/* Return true if callback-invocation time limit exceeded. */
+static bool rcu_do_batch_check_time(long count, long tlimit,
+ bool jlimit_check, unsigned long jlimit)
{
- bool blkd;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(rnp->qsmask & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
- return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- raw_lockdep_assert_held_rcu_node(rnp_leaf);
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
- WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- /* Between grace periods, so better already be zero! */
- WARN_ON_ONCE(rnp->qsmask);
- if (rnp->qsmaskinit) {
- raw_spin_unlock_rcu_node(rnp);
- /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
+ // Invoke local_clock() only once per 32 consecutive callbacks.
+ return unlikely(tlimit) &&
+ (!likely(count & 31) ||
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
+ jlimit_check && time_after(jiffies, jlimit))) &&
+ local_clock() >= tlimit;
}
/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ long bl;
+ long count = 0;
+ int div;
+ bool __maybe_unused empty;
unsigned long flags;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
- struct rcu_head *rhp;
+ unsigned long jlimit;
+ bool jlimit_check = false;
+ long pending;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count;
- long pending, tlimit = 0;
+ struct rcu_head *rhp;
+ long tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2375,41 +2137,55 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
/*
- * Extract the list of ready callbacks, disabling to prevent
+ * Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- pending = rcu_segcblist_n_cbs(&rdp->cblist);
- bl = max(rdp->blimit, pending >> rcu_divisor);
- if (unlikely(bl > 100))
- tlimit = local_clock() + rcu_resched_ns;
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
+ div = READ_ONCE(rcu_divisor);
+ div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
+ bl = max(rdp->blimit, pending >> div);
+ if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) {
+ const long npj = NSEC_PER_SEC / HZ;
+ long rrn = READ_ONCE(rcu_resched_ns);
+
+ rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
+ tlimit = local_clock() + rrn;
+ jlimit = jiffies + (rrn + npj + 1) / npj;
+ jlimit_check = true;
+ }
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- if (offloaded)
+ if (rcu_rdp_is_offloaded(rdp))
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
+ count++;
debug_rcu_head_unqueue(rhp);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_callback(rcu_state.name, rhp);
f = rhp->func;
+ debug_rcu_head_callback(rhp);
WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
f(rhp);
@@ -2417,39 +2193,42 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
- * Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl && !offloaded &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((-rcl.len & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
- break;
- }
- if (offloaded) {
- WARN_ON_ONCE(in_serving_softirq());
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
+ break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
+ break;
+ } else {
+ // In rcuc/rcuoc context, so no worries about
+ // depriving other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
lockdep_assert_irqs_enabled();
local_bh_disable();
+ // But rcuc kthreads can delay quiescent-state
+ // reporting, so check time limits for them.
+ if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
+ rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
+ rdp->rcu_cpu_has_work = 1;
+ break;
+ }
}
}
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
- count = -rcl.len;
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
- is_idle_task(current), rcu_is_callbacks_kthread());
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rcu_segcblist_insert_count(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
@@ -2459,7 +2238,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2467,15 +2246,15 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- count != 0 && rcu_segcblist_empty(&rdp->cblist));
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
rcu_nocb_unlock_irqrestore(rdp, flags);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
@@ -2485,11 +2264,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
+ unsigned long j;
+
+ if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ j = jiffies;
+ WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ __this_cpu_write(rcu_data.last_sched_clock, j);
+ }
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
@@ -2503,6 +2290,9 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ if (user || rcu_is_cpu_rrupt_from_idle())
+ rcu_note_voluntary_context_switch(current);
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2518,20 +2308,19 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
int cpu;
unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_state.cbovld = rcu_state.cbovldnext;
rcu_state.cbovldnext = false;
rcu_for_each_leaf_node(rnp) {
+ unsigned long mask = 0;
+ unsigned long rsmask = 0;
+
cond_resched_tasks_rcu_qs();
- mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
- rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
* are all zero. But we might need to
@@ -2545,11 +2334,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
continue;
}
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ struct rcu_data *rdp;
+ int ret;
+
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (f(rdp)) {
+ ret = f(rdp);
+ if (ret > 0) {
mask |= rdp->grpmask;
rcu_disable_urgency_upon_qs(rdp);
}
+ if (ret < 0)
+ rsmask |= rdp->grpmask;
}
if (mask != 0) {
/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2558,6 +2353,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
/* Nothing to do here, so just drop the lock. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+
+ for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+ resched_cpu(cpu);
}
}
@@ -2572,8 +2370,10 @@ void rcu_force_quiescent_state(void)
struct rcu_node *rnp;
struct rcu_node *rnp_old = NULL;
+ if (!rcu_gp_in_progress())
+ return;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = __this_cpu_read(rcu_data.mynode);
+ rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
@@ -2599,14 +2399,38 @@ void rcu_force_quiescent_state(void)
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+// Workqueue handler for an RCU reader for kernels enforcing struct RCU
+// grace periods.
+static void strict_work_handler(struct work_struct *work)
+{
+ rcu_read_lock();
+ rcu_read_unlock();
+}
+
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ /*
+ * On RT rcu_core() can be preempted when IRQs aren't disabled.
+ * Therefore this function can race with concurrent NOCB (de-)offloading
+ * on this CPU and the below condition must be considered volatile.
+ * However if we race with:
+ *
+ * _ Offloading: In the worst case we accelerate or process callbacks
+ * concurrently with NOCB kthreads. We are guaranteed to
+ * call rcu_nocb_lock() if that happens.
+ *
+ * _ Deoffloading: In the worst case we miss callbacks acceleration or
+ * processing. This is fine because the early stage
+ * of deoffloading invokes rcu_core() after setting
+ * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
+ * what could have been dismissed without the need to wait
+ * for the next rcu_pending() check in the next jiffy.
+ */
+ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2614,7 +2438,7 @@ static __latent_entropy void rcu_core(void)
WARN_ON_ONCE(!rdp->beenonline);
/* Report any deferred quiescent states if preemption enabled. */
- if (!(preempt_count() & PREEMPT_MASK)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
@@ -2626,23 +2450,31 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
- local_irq_save(flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
- likely(READ_ONCE(rcu_scheduler_fully_active)))
+ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
trace_rcu_utilization(TPS("End RCU core"));
+
+ // If strict GPs, schedule an RCU reader in a clean environment.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
}
static void rcu_core_si(struct softirq_action *h)
@@ -2705,20 +2537,22 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
work = *workp;
- *workp = 0;
+ WRITE_ONCE(*workp, 0);
local_irq_enable();
if (work)
rcu_core();
local_bh_enable();
- if (*workp == 0) {
+ if (!READ_ONCE(*workp)) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
@@ -2726,9 +2560,10 @@ static void rcu_cpu_kthread(unsigned int cpu)
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
@@ -2749,13 +2584,12 @@ static int __init rcu_spawn_core_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ if (use_softirq)
return 0;
WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
-early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2793,10 +2627,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -2851,11 +2685,12 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/* Helper function for call_rcu() and friends. */
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
+ static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
bool was_alldone;
@@ -2866,17 +2701,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
- * time callback was passed to __call_rcu().
+ * time callback was passed to call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
- head, head->func);
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
head->func = func;
head->next = NULL;
+ kasan_record_aux_stack_noalloc(head);
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -2890,21 +2729,22 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rcu_state.name, head,
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
else
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+
/* Go handle any RCU core processing required. */
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ if (unlikely(rcu_rdp_is_offloaded(rdp))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
} else {
__call_rcu_core(rdp, head, flags);
@@ -2912,8 +2752,40 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
}
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+ __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#endif
+
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
@@ -2921,12 +2793,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested. In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections. This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
@@ -2946,64 +2820,81 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func);
+ __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
}
EXPORT_SYMBOL_GPL(call_rcu);
-
/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
+#define FREE_N_CHANNELS 2
/**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
- * @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
+ * @records: Array of the kvfree_rcu() pointers
*/
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
- void *records[KFREE_BULK_MAX_ENTR];
- struct kfree_rcu_bulk_data *next;
- struct rcu_head *head_free_debug;
+ void *records[];
};
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kfree_rcu_bulk_data *bhead_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
+ * @bkvcache:
+ * A simple cache list that contains objects for reuse purpose.
+ * In order to save some per-cpu space the list is singular.
+ * Even though it is lockless an access has to be protected by the
+ * per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -3011,282 +2902,581 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
struct rcu_head *head;
- struct kfree_rcu_bulk_data *bhead;
- struct kfree_rcu_bulk_data *bcached;
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- spinlock_t lock;
+ raw_spinlock_t lock;
struct delayed_work monitor_work;
- bool monitor_todo;
bool initialized;
- // Number of objects for which GP not started
- int count;
+
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
};
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- for (; head; head = head->next)
- debug_rcu_head_unqueue(head);
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+ return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ if (!rcu_min_cached_objs)
+ return 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
+}
+
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+ }
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
- struct rcu_head *head, *next;
- struct kfree_rcu_bulk_data *bhead, *bnext;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
+ int i;
krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
+ struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
- spin_lock_irqsave(&krcp->lock, flags);
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+
+ // Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
- bhead = krwp->bhead_free;
- krwp->bhead_free = NULL;
- spin_unlock_irqrestore(&krcp->lock, flags);
+ head_gp_snap = krwp->head_free_gp_snap;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle the first two channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
- /* "bhead" is now private, so traverse locklessly. */
- for (; bhead; bhead = bnext) {
- bnext = bhead->next;
+ /*
+ * This is used when the "bulk" path can not be used for the
+ * double-argument of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
+ */
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
+}
- debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
- bhead->nr_records, bhead->records);
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
- kfree_bulk(bhead->nr_records, bhead->records);
- rcu_lock_release(&rcu_callback_map);
+ return !!READ_ONCE(krcp->head);
+}
- if (cmpxchg(&krcp->bcached, NULL, bhead))
- free_page((unsigned long) bhead);
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
- cond_resched_tasks_rcu_qs();
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+ return;
}
+ queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
- /*
- * Emergency case only. It can happen under low memory
- * condition when an allocation gets failed, so the "bulk"
- * path can not be temporary maintained.
- */
- for (; head; head = next) {
- unsigned long offset = (unsigned long)head->func;
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
- next = head->next;
- debug_rcu_head_unqueue(head);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
- kfree((void *)head - offset);
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+ break;
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
}
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
}
/*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
-static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+static void kfree_rcu_monitor(struct work_struct *work)
{
- struct kfree_rcu_cpu_work *krwp;
- bool queued = false;
- int i;
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+ unsigned long flags;
+ int i, j;
+
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
- lockdep_assert_held(&krcp->lock);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- /*
- * Try to detach bhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
- if ((krcp->bhead && !krwp->bhead_free) ||
- (krcp->head && !krwp->head_free)) {
- /* Channel 1. */
- if (!krwp->bhead_free) {
- krwp->bhead_free = krcp->bhead;
- krcp->bhead = NULL;
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. Any channel is not free means at krwp
+ // there is on-going rcu work to handle krwp's free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
+ }
}
- /* Channel 2. */
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
- krcp->head = NULL;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
}
- WRITE_ONCE(krcp->count, 0);
-
- /*
- * One work is per one batch, so there are two "free channels",
- * "bhead_free" and "head_free" the batch can handle. It can be
- * that the work is in the pending state when two channels have
- * been detached following each other, one by one.
- */
+ // One work is per one batch, so there are three
+ // "free channels", the batch can handle. It can
+ // be that the work is in the pending state when
+ // channels have been detached following by each
+ // other.
queue_rcu_work(system_wq, &krwp->rcu_work);
- queued = true;
}
}
- return queued;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // If there is nothing to detach, it means that our job is
+ // successfully done here. In case of having at least one
+ // of the channels that is still busy we should rearm the
+ // work to repeat an attempt. Because previous batches are
+ // still in progress.
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
-static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
- unsigned long flags)
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
{
- // Attempt to start a new batch.
- krcp->monitor_todo = false;
- if (queue_kfree_rcu_work(krcp)) {
- // Success! Our job is done here.
- spin_unlock_irqrestore(&krcp->lock, flags);
- return;
- }
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
- // Previous RCU batch still in progress, try again later.
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+ return HRTIMER_NORESTART;
}
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
+static void fill_page_cache_func(struct work_struct *work)
{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work.work);
unsigned long flags;
- struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
- monitor_work.work);
+ int nr_pages;
+ bool pushed;
+ int i;
- spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!bnode)
+ break;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
}
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(system_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+ }
+}
+
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
- struct rcu_head *head, rcu_callback_t func)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_alloc)
{
- struct kfree_rcu_bulk_data *bnode;
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
- if (unlikely(!krcp->initialized))
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
return false;
- lockdep_assert_held(&krcp->lock);
+ idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!krcp->bhead ||
- krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
- bnode = xchg(&krcp->bcached, NULL);
- if (!bnode) {
- WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
-
- bnode = (struct kfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc) {
+ krc_this_cpu_unlock(*krcp, *flags);
+
+ // __GFP_NORETRY - allows a light-weight direct reclaim
+ // what is OK from minimizing of fallback hitting point of
+ // view. Apart of that it forbids any OOM invoking what is
+ // also beneficial since we are about to release memory soon.
+ //
+ // __GFP_NOMEMALLOC - prevents from consuming of all the
+ // memory reserves. Please note we have a fallback path.
+ //
+ // __GFP_NOWARN - it is supposed that an allocation can
+ // be failed under low memory or high memory pressure
+ // scenarios.
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
}
- /* Switch to emergency path. */
- if (unlikely(!bnode))
+ if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = krcp->bhead;
- bnode->head_free_debug = NULL;
-
- /* Attach it to the head. */
- krcp->bhead = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- head->func = func;
- head->next = krcp->bhead->head_free_debug;
- krcp->bhead->head_free_debug = head;
-#endif
-
- /* Finally insert. */
- krcp->bhead->records[krcp->bhead->nr_records++] =
- (void *) head - (unsigned long) func;
+ // Finally insert and update the GP for this page.
+ bnode->records[bnode->nr_records++] = ptr;
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
+ atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
}
/*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
*
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
+ bool success;
- local_irq_save(flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- if (krcp->initialized)
- spin_lock(&krcp->lock);
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ if (!head)
+ might_sleep();
// Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(head)) {
+ if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
- goto unlock_return;
+
+ // Mark as success and leave.
+ return;
}
- /*
- * Under high memory pressure GFP_NOWAIT can fail,
- * in that case the emergency path is maintained.
- */
- if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
- head->func = func;
+ kasan_record_aux_stack_noalloc(ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+ if (!success) {
+ run_page_cache_worker(krcp);
+
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
+ head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
+ success = true;
}
- WRITE_ONCE(krcp->count, krcp->count + 1);
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false positives reporting).
+ */
+ kmemleak_ignore(ptr);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !krcp->monitor_todo) {
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- }
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ schedule_delayed_monitor_work(krcp);
unlock_return:
- if (krcp->initialized)
- spin_unlock(&krcp->lock);
- local_irq_restore(flags);
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
@@ -3295,31 +3485,29 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count = 0;
/* Snapshot count of all CPUs */
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count += READ_ONCE(krcp->count);
+ count += krc_count(krcp);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
}
- return count;
+ return count == 0 ? SHRINK_EMPTY : count;
}
static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
- unsigned long flags;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count = krcp->count;
- spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ count = krc_count(krcp);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
@@ -3328,59 +3516,38 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- return freed;
+ return freed == 0 ? SHRINK_STOP : freed;
}
-static struct shrinker kfree_rcu_shrinker = {
- .count_objects = kfree_rcu_shrink_count,
- .scan_objects = kfree_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
-
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_irqsave(&krcp->lock, flags);
- if (!krcp->head || krcp->monitor_todo) {
- spin_unlock_irqrestore(&krcp->lock, flags);
- continue;
- }
- krcp->monitor_todo = true;
- schedule_delayed_work_on(cpu, &krcp->monitor_work,
- KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
}
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
*
- * Howevr, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
static int rcu_blocking_is_gp(void)
{
- int ret;
-
- if (IS_ENABLED(CONFIG_PREEMPTION))
- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- ret = num_online_cpus() <= 1;
- preempt_enable();
- return ret;
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
+ return false;
+ }
+ return true;
}
/**
@@ -3391,10 +3558,12 @@ static int rcu_blocking_is_gp(void)
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
@@ -3415,28 +3584,70 @@ static int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void synchronize_rcu(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu_hurry);
return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
+ }
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * to determine whether or not a full grace period has elapsed in the
- * meantime.
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.
*/
unsigned long get_state_synchronize_rcu(void)
{
@@ -3445,33 +3656,256 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
- return rcu_seq_snap(&rcu_state.gp_seq);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
+ *
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
+ *
+ * This does not guarantee that the needed grace period will actually
+ * start.
+ */
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
+ */
+ smp_mb(); /* ^^^ */
+ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ lockdep_assert_irqs_enabled();
+ local_irq_save(flags);
+ rdp = this_cpu_ptr(&rcu_data);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); // irqs already disabled.
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap. This is OK, the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ start_poll_synchronize_rcu_common();
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/**
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * Places the normal and expedited grace-period states in *@rgos. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which @oldstate was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @oldstate
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than a billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
+ *
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
+/**
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
- * get_state_synchronize_rcu(), just return. Otherwise, invoke
- * synchronize_rcu() to wait for a full grace period.
+ * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
+ * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
{
- if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
+ if (!poll_state_synchronize_rcu(oldstate))
synchronize_rcu();
- else
- smp_mb(); /* Ensure GP ends before subsequent accesses. */
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
+
/*
* Check to see if there is any immediate RCU-related work to be done by
* the current CPU, returning 1 if so and zero otherwise. The checks are
@@ -3485,11 +3919,13 @@ static int rcu_pending(int user)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
@@ -3502,13 +3938,13 @@ static int rcu_pending(int user)
return 1;
/* Does this CPU have callbacks ready to invoke? */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!rcu_rdp_is_offloaded(rdp) &&
+ rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
- (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
- !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
+ !rcu_rdp_is_offloaded(rdp) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -3554,26 +3990,56 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
}
/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- uintptr_t cpu = (uintptr_t)cpu_in;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ bool wake_nocb = false;
+ bool was_alldone = false;
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+ * queue. This way we don't wait for bypass timer that can reach seconds
+ * if it's fully lazy.
+ */
+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ if (wake_nocb)
+ wake_nocb_gp(rdp, false);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
@@ -3587,6 +4053,8 @@ static void rcu_barrier_func(void *cpu_in)
void rcu_barrier(void)
{
uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -3597,15 +4065,16 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
- rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
}
/* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
@@ -3617,7 +4086,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- get_online_cpus();
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -3626,29 +4095,31 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (cpu_is_offline(cpu) &&
- !rcu_segcblist_is_offloaded(&rdp->cblist))
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
+ continue;
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
- rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
- } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
- cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
- rcu_state.barrier_sequence);
- local_irq_disable();
- rcu_barrier_func((void *)cpu);
- local_irq_enable();
- } else if (cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
}
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
+ }
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- put_online_cpus();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -3663,16 +4134,218 @@ void rcu_barrier(void)
/* Mark the end of the barrier operation. */
rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_state.barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
+static unsigned long rcu_barrier_last_throttle;
+
+/**
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permits unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust. There will be at most one call per second to
+ * rcu_barrier() system-wide from use of this function, which means that
+ * callers might needlessly wait a second or three.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next. See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable? That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
+ */
+static void rcu_barrier_throttled(void)
+{
+ unsigned long j = jiffies;
+ unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+ unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+ while (time_in_range(j, old, old + HZ / 16) ||
+ !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+ schedule_timeout_idle(HZ / 16);
+ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+ smp_mb(); /* caller's subsequent code after above check. */
+ return;
+ }
+ j = jiffies;
+ old = READ_ONCE(rcu_barrier_last_throttle);
+ }
+ rcu_barrier();
+}
+
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives. We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+ bool b;
+ int ret;
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+ return -EAGAIN;
+ ret = kstrtobool(val, &b);
+ if (!ret && b) {
+ atomic_inc((atomic_t *)kp->arg);
+ rcu_barrier_throttled();
+ atomic_dec((atomic_t *)kp->arg);
+ }
+ return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+ .set = param_set_do_rcu_barrier,
+ .get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
+
+/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+bool rcu_cpu_online(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_rdp_cpu_online(rdp);
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+}
+
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
@@ -3703,16 +4376,20 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
- WARN_ON_ONCE(rdp->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ INIT_WORK(&rdp->strict_work, strict_work_handler);
+ WARN_ON_ONCE(ct->dynticks_nesting != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
}
@@ -3730,39 +4407,44 @@ rcu_boot_init_percpu_data(int cpu)
int rcutree_prepare_cpu(unsigned int cpu)
{
unsigned long flags;
+ struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rcu_get_root();
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !rcu_segcblist_is_offloaded(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
- rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
- rcu_dynticks_eqs_online();
+ ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
+ * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
+ * (re-)initialized.
+ */
+ if (!rcu_segcblist_is_enabled(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
+
+ /*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
* propagation up the rcu_node tree will happen at the beginning
* of the next grace period.
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rdp->beenonline = true; /* We have now been online. */
rdp->gp_seq = READ_ONCE(rnp->gp_seq);
rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
+ rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_prepare_kthreads(cpu);
+ rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
}
@@ -3778,6 +4460,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
}
/*
+ * Has the specified (known valid) CPU ever been fully online?
+ */
+bool rcu_cpu_beenfullyonline(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return smp_load_acquire(&rdp->beenonline);
+}
+
+/*
* Near the end of the CPU-online process. Pretty much all services
* enabled, and the CPU is now very much alive.
*/
@@ -3803,31 +4495,6 @@ int rcutree_online_cpu(unsigned int cpu)
}
/*
- * Near the beginning of the process. The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
- unsigned long flags;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rnp->ffmask &= ~rdp->grpmask;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- rcutree_affinity_setting(cpu, cpu);
-
- // nohz_full CPUs need the tick for stop-machine to work quickly
- tick_dep_set(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-static DEFINE_PER_CPU(int, rcu_cpu_started);
-
-/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
* incoming CPUs are not allowed to use RCU read-side critical sections
@@ -3837,47 +4504,57 @@ static DEFINE_PER_CPU(int, rcu_cpu_started);
* Note that this function is special in that it is invoked directly
* from the incoming CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
*/
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
{
- unsigned long flags;
unsigned long mask;
- int nbits;
- unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ bool newcpu;
- if (per_cpu(rcu_cpu_started, cpu))
+ lockdep_assert_irqs_disabled();
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu_started)
return;
+ rdp->cpu_started = true;
- per_cpu(rcu_cpu_started, cpu) = 1;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
rnp = rdp->mynode;
mask = rdp->grpmask;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ rcu_dynticks_eqs_online();
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
- oldmask = rnp->expmaskinitnext;
+ raw_spin_unlock(&rcu_state.barrier_lock);
+ newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
- oldmask ^= rnp->expmaskinitnext;
- nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
/* Allow lockless access for expedited grace periods. */
- smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
- if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
+
+ /* An incoming CPU should never be blocking a grace period. */
+ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags;
+
+ local_irq_save(flags);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_rcu_node(rnp);
}
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ smp_store_release(&rdp->beenonline, true);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
-#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing function has no further need of RCU, so remove it from
* the rcu_node tree's ->qsmaskinitnext bit masks.
@@ -3885,38 +4562,45 @@ void rcu_cpu_starting(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the outgoing CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
*/
-void rcu_report_dead(unsigned int cpu)
+void rcutree_report_cpu_dead(void)
{
unsigned long flags;
unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
- /* QS for any half-done expedited grace period. */
- preempt_disable();
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
- preempt_enable();
+ /*
+ * IRQS must be disabled from now on and until the CPU dies, or an interrupt
+ * may introduce a new READ-side while it is actually off the QS masks.
+ */
+ lockdep_assert_irqs_disabled();
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
rcu_preempt_deferred_qs(current);
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- raw_spin_lock(&rcu_state.ofl_lock);
+ arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- raw_spin_unlock(&rcu_state.ofl_lock);
-
- per_cpu(rcu_cpu_started, cpu) = 0;
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ rdp->cpu_started = false;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing CPU has just passed through the dying-idle state, and we
* are being invoked from the CPU that was IPIed to continue the offline
@@ -3930,25 +4614,28 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ if (rcu_rdp_is_offloaded(rdp) ||
rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
needwake = rcu_advance_cbs(my_rnp, rdp) ||
rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
- WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
- !rcu_segcblist_n_cbs(&my_rdp->cblist));
- if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
+ if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
} else {
@@ -3964,7 +4651,60 @@ void rcutree_migrate_callbacks(int cpu)
cpu, rcu_segcblist_n_cbs(&rdp->cblist),
rcu_segcblist_first_cb(&rdp->cblist));
}
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * Near the beginning of the process. The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcutree_affinity_setting(cpu, cpu);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
+ return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
* On non-huge systems, use expedited RCU grace periods to make suspend
@@ -3976,11 +4716,13 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ rcu_async_hurry();
rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -3988,31 +4730,61 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+ const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+ const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", gp_kworker_name);
+ return;
+ }
+
+ rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+ pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ kthread_destroy_worker(rcu_exp_gp_kworker);
+ return;
+ }
+
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+ &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
-
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
- && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
- kthread_prio = 2;
- else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
-
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@ -4030,8 +4802,17 @@ static int __init rcu_spawn_gp_kthread(void)
smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
- rcu_spawn_nocb_kthreads();
- rcu_spawn_boost_kthreads();
+ /* This is a pre-SMP initcall, we expect a single CPU */
+ WARN_ON(num_online_cpus() > 1);
+ /*
+ * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ * due to rcu_scheduler_fully_active.
+ */
+ rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ rcu_spawn_one_boost_kthread(rdp->mynode);
+ rcu_spawn_core_kthreads();
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworkers();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4048,9 +4829,20 @@ early_initcall(rcu_spawn_gp_kthread);
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
rcu_test_sync_prims();
+
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
+
+ // Switch out of early boot mode.
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
}
@@ -4123,6 +4915,10 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->boost_kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@@ -4138,15 +4934,51 @@ static void __init rcu_init_one(void)
}
/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
+}
+
+/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
{
ulong d;
int i;
+ static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpus_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
@@ -4240,37 +5072,65 @@ static void __init rcu_dump_rcu_node_tree(void)
}
struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
+ struct shrinker *kfree_rcu_shrinker;
+
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
- if (register_shrinker(&kfree_rcu_shrinker))
- pr_err("Failed to register kfree_rcu() shrinker!\n");
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
}
void __init rcu_init(void)
{
- int cpu;
+ int cpu = smp_processor_id();
rcu_early_boot_tests();
kfree_rcu_batch_init();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
rcu_init_one();
if (dump_tree)
@@ -4284,18 +5144,15 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu) {
- rcutree_prepare_cpu(cpu);
- rcu_cpu_starting(cpu);
- rcutree_online_cpu(cpu);
- }
+ WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ rcutree_prepare_cpu(cpu);
+ rcutree_report_cpu_starting(cpu);
+ rcutree_online_cpu(cpu);
- /* Create workqueue for expedited GPs and for Tree SRCU. */
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
- srcu_init();
+ rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -4303,8 +5160,14 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
+
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
}
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 43991a40b084..e9821a8422db 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
*/
#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/rtmutex.h>
#include <linux/threads.h>
@@ -23,7 +24,11 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+ struct kthread_work rew_work;
+#else
struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -41,7 +46,7 @@ struct rcu_node {
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
/* some rcu_state fields as well as */
/* following. */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */
+ unsigned long gp_seq; /* Track rsp->gp_seq. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -56,7 +61,6 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
@@ -73,9 +77,9 @@ struct rcu_node {
unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
- int grplo; /* lowest-numbered CPU or group here. */
- int grphi; /* highest-numbered CPU or group here. */
- u8 grpnum; /* CPU/group number for next level up. */
+ int grplo; /* lowest-numbered CPU here. */
+ int grphi; /* highest-numbered CPU here. */
+ u8 grpnum; /* group number for next level up. */
u8 level; /* root is at level 0. */
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
@@ -109,11 +113,15 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex boost_kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
+ unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -125,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@@ -146,16 +158,33 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
+/*
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */
+ unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
- bool core_needs_qs; /* Core waits for quiesc state. */
+ bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
+ bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
unsigned long ticks_this_gp; /* The number of scheduling-clock */
@@ -164,6 +193,7 @@ struct rcu_data {
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
bool defer_qs_iw_pending; /* Scheduler attention pending? */
+ struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
struct rcu_segcblist cblist; /* Segmented callback list, with */
@@ -171,38 +201,35 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- long dynticks_nesting; /* Track process nesting level. */
- long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
@@ -213,7 +240,6 @@ struct rcu_data {
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
- struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
u8 nocb_gp_gp; /* GP to wait for on last scan? */
@@ -222,8 +248,12 @@ struct rcu_data {
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
- struct rcu_data *nocb_next_cb_rdp;
- /* Next rcu_data in wakeup chain. */
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -235,6 +265,7 @@ struct rcu_data {
/* rcuc per-CPU kthread or NULL. */
unsigned int rcu_cpu_kthread_status;
char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
/* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
@@ -247,14 +278,20 @@ struct rcu_data {
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
+ long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOCB_WAKE_NOT 0
-#define RCU_NOCB_WAKE 1
-#define RCU_NOCB_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_BYPASS 1
+#define RCU_NOCB_WAKE_LAZY 2
+#define RCU_NOCB_WAKE 3
+#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -295,18 +332,23 @@ struct rcu_state {
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
int ncpus; /* # CPUs seen so far. */
+ int n_online_cpus; /* # CPUs online for RCU. */
/* The following fields are guarded by the root rcu_node's lock. */
- u8 boost ____cacheline_internodealigned_in_smp;
- /* Subject to priority boost. */
- unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@@ -317,6 +359,8 @@ struct rcu_state {
/* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
@@ -342,18 +386,21 @@ struct rcu_state {
/* in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ int nr_fqs_jiffies_stall; /* Number of fqs loops after
+ * which read jiffies and set
+ * jiffies_stall. Stall
+ * warnings disabled if !0. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
/* GP start. */
- unsigned long gp_max; /* Maximum GP duration in */
- /* jiffies. */
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
+ int nocb_is_setup; /* nocb is setup from boot */
};
/* Values for rcu_state structure's gp_flags field. */
@@ -408,30 +455,27 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
-static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
-static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
+ unsigned long j, bool lazy);
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags);
+ bool *was_alldone, unsigned long flags,
+ bool lazy);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
static void show_rcu_nocb_state(struct rcu_data *rdp);
static void rcu_nocb_lock(struct rcu_data *rdp);
static void rcu_nocb_unlock(struct rcu_data *rdp);
@@ -440,12 +484,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags) \
-do { \
- if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
- local_irq_save(flags); \
- else \
- raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
@@ -453,10 +501,6 @@ do { \
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
-static void rcu_dynticks_task_trace_enter(void);
-static void rcu_dynticks_task_trace_exit(void);
/* Forward declarations for tree_stall.h */
static void record_gp_stall_check_time(void);
@@ -464,3 +508,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 72952edad1e4..2ac440bc7e10 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -11,6 +11,7 @@
static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -18,6 +19,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@@ -34,6 +36,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@@ -170,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
return ret;
}
-
/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
@@ -198,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- swake_up_one(&rcu_state.expedited_wq);
+ swake_up_one_online(&rcu_state.expedited_wq);
}
break;
}
@@ -255,7 +257,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->exp_deferred_qs, false);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -334,15 +336,13 @@ fastpath:
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
{
int cpu;
unsigned long flags;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -358,7 +358,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(rdp);
+ snap = rcu_dynticks_snap(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -387,6 +387,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
@@ -403,7 +404,7 @@ retry_ipi:
/* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_idle(1);
goto retry_ipi;
}
/* CPU really is offline, so we must report its QS. */
@@ -416,13 +417,119 @@ retry_ipi:
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /* If all offline, queue the work on an unbound CPU. */
+ if (unlikely(cpu > rnp->grphi - rnp->grplo))
+ cpu = WORK_CPU_UNBOUND;
+ else
+ cpu += rnp->grplo;
+ queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+ destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
static void sync_rcu_exp_select_cpus(void)
{
- int cpu;
struct rcu_node *rnp;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -434,28 +541,21 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!READ_ONCE(rcu_par_gp_wq) ||
+ if (!rcu_gp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
- /* No workqueues yet or last leaf, do direct call. */
+ /* No worker started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue;
}
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_queue_work(rnp);
rnp->exp_need_flush = true;
}
- /* Wait for workqueue jobs (if any) to complete. */
+ /* Wait for jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush)
- flush_work(&rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_flush_work(rnp);
}
/*
@@ -493,34 +593,42 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
- jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_stall = rcu_exp_jiffies_till_stall_check();
jiffies_start = jiffies;
if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
return;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
}
for (;;) {
+ unsigned long j;
+
if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
if (rcu_stall_is_suppressed())
continue;
- panic_on_rcu_stall();
+ j = jiffies;
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
+ trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
ndetected = 0;
@@ -534,18 +642,19 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c", cpu,
+ pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- jiffies - jiffies_start, rcu_state.expedited_sequence,
+ j - jiffies_start, rcu_state.expedited_sequence,
data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
if (ndetected) {
- pr_err("blocking rcu_node structures:");
+ pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
@@ -563,10 +672,14 @@ static void synchronize_rcu_expedited_wait(void)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(READ_ONCE(rnp->expmask) & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
+ rcu_exp_print_detail_task_stall_rnp(rnp);
}
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+ panic_on_rcu_stall();
}
}
@@ -617,17 +730,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -652,10 +754,10 @@ static void rcu_exp_handler(void *unused)
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -677,7 +779,7 @@ static void rcu_exp_handler(void *unused)
if (depth > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -704,9 +806,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -717,6 +821,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return ndetected;
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Request an expedited quiescent state. */
@@ -732,15 +866,15 @@ static void rcu_exp_need_qs(void)
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void rcu_exp_handler(void *unused)
{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
- rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
- if (rcu_is_cpu_rrupt_from_idle()) {
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
@@ -761,7 +895,7 @@ static void sync_sched_exp_online_cleanup(int cpu)
my_cpu = get_cpu();
/* Quiescent state either not needed or already requested, leave. */
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ READ_ONCE(rdp->cpu_no_qs.b.exp)) {
put_cpu();
return;
}
@@ -789,6 +923,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return 0;
}
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
@@ -814,6 +957,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -824,12 +968,25 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
/* Is the state is such that the call is a grace period? */
- if (rcu_blocking_is_gp())
- return;
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
+ return; // Context allows vacuous grace periods.
+ }
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
return;
}
@@ -845,20 +1002,155 @@ void synchronize_rcu_expedited(void)
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
+ synchronize_rcu_expedited_queue_work(&rew);
}
/* Wait for expedited grace period to complete. */
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
+ smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
if (likely(!boottime))
- destroy_work_on_stack(&rew.rew_work);
+ synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..4efbf7333d4e
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1805 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ }
+ rcu_state.nocb_is_setup = true;
+ return 1;
+}
+__setup("rcu_nocbs", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 1;
+}
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons
+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
+ * left unsubmitted to RCU after those many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+ jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+ return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in,
+ unsigned long j, bool lazy)
+{
+ struct rcu_cblist rcl;
+ struct rcu_head *rhp = rhp_in;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+
+ /*
+ * If the new CB requested was a lazy one, queue it onto the main
+ * ->cblist so that we can take advantage of the grace-period that will
+ * happen regardless. But queue it onto the bypass list first so that
+ * the lazy CB is ordered with the existing CBs in the bypass list.
+ */
+ if (lazy && rhp) {
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ rhp = NULL;
+ }
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ WRITE_ONCE(rdp->lazy_len, 0);
+
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags,
+ bool lazy)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ // Lazy CBs throttle this back and do immediate bypass queuing.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ (ncbs && bypass_is_lazy &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+
+ // The flush succeeded and we moved CBs into the regular list.
+ // Don't wait for the wake up timer as it may be too far ahead.
+ // Wake up the GP thread now instead, if the cblist was empty.
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
+
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ // A wake up of the grace period kthread or timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (ncbs && (!bypass_is_lazy || lazy)) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ long bypass_len;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long lazy_len;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+}
+
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+ bool *wake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Offloading. Set our flag and notify the offload worker.
+ * We will handle this rdp until it ever gets de-offloaded.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 1;
+ } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 0;
+ } else {
+ WARN_ON_ONCE(1);
+ ret = -1;
+ }
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return ret;
+}
+
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool lazy = false;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ lockdep_assert_held(&rdp->nocb_lock);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ }
+
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is well
+ * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
+ * race with another round of nocb toggling for this rdp.
+ * Nocb locking should prevent from that already but we stick
+ * to paranoia, especially in rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ bool wake_state = false;
+ int ret;
+
+ ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ if (ret == 1)
+ list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+ else if (ret == 0)
+ list_del(&rdp_toggling->nocb_entry_rdp);
+ if (wake_state)
+ swake_up_one(&rdp_toggling->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ return wake_gp;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /*
+ * rcu_nocb_rdp_deoffload() may be called directly if
+ * rcuog/o[p] spawn failed, because at this time the rdp->cpu
+ * is not online yet.
+ */
+ WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ /*
+ * Start with invoking rcu_core() early. This way if the current thread
+ * happens to preempt an ongoing call to rcu_core() in the middle,
+ * leaving some work dismissed because rcu_core() still thinks the rdp is
+ * completely offloaded, we are guaranteed a nearby future instance of
+ * rcu_core() to catch up.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
+ invoke_rcu_core();
+ wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ /*
+ * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
+ * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+ */
+ if (!rdp->nocb_cb_kthread) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ } else {
+ /*
+ * No kthread to clear the flags for us or remove the rdp from the nocb list
+ * to iterate. Do it here instead. Locking doesn't look stricly necessary
+ * but we stick to paranoia in this rare path.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
+ /*
+ * Without SEGCBLIST_LOCKING, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return 0;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+
+ /*
+ * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
+ * is set.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ wake_gp = rdp_offload_toggle(rdp, true, flags);
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ /*
+ * All kthreads are ready to work, we can finally relieve rcu_core() and
+ * enable nocb bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return 0;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+#ifdef CONFIG_RCU_LAZY
+static unsigned long
+lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.barrier_mutex))
+ return 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ count += READ_ONCE(rdp->lazy_len);
+ }
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ /*
+ * But really don't insist if barrier_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
+
+ if (!READ_ONCE(rdp->lazy_len))
+ continue;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+ * lock we may still race with increments from the enqueuer but still
+ * we know for sure if there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ sc->nr_to_scan -= _count;
+ count += _count;
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return count ? count : SHRINK_STOP;
+}
+#endif // #ifdef CONFIG_RCU_LAZY
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ const struct cpumask *cpumask = NULL;
+ struct shrinker * __maybe_unused lazy_rcu_shrinker;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ cpumask = tick_nohz_full_mask;
+#endif
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) &&
+ !rcu_state.nocb_is_setup && !cpumask)
+ cpumask = cpu_possible_mask;
+
+ if (cpumask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask);
+ rcu_state.nocb_is_setup = true;
+ }
+
+ if (!rcu_state.nocb_is_setup)
+ return;
+
+#ifdef CONFIG_RCU_LAZY
+ lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+ if (!lazy_rcu_shrinker) {
+ pr_err("Failed to allocate lazy_rcu shrinker!\n");
+ } else {
+ lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+ lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+ shrinker_register(lazy_rcu_shrinker);
+ }
+#endif // #ifdef CONFIG_RCU_LAZY
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+ struct sched_param sp;
+
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
+ rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+ goto end;
+ }
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ goto end;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+end:
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_data *nocb_next_rdp;
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ return false;
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags, bool lazy)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 352223664ebd..41021080ad25 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,10 +13,29 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
+{
+ /*
+ * In order to read the offloaded state of an rdp in a safe
+ * and stable way and prevent from its value to be changed
+ * under us, we must either hold the barrier mutex, the cpu
+ * hotplug lock (read or write) or the nocb lock. Local
+ * non-preemptible reads are also safe. NOCB kthreads and
+ * timers have their own means of synchronization against the
+ * offloaded state updaters.
+ */
+ RCU_LOCKDEP_WARN(
+ !(lockdep_is_held(&rcu_state.barrier_mutex) ||
+ (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
+ rcu_lockdep_is_held_nocb(rdp) ||
+ (rdp == this_cpu_ptr(&rcu_data) &&
+ !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ rcu_current_is_nocb_kthread(rdp)),
+ "Unsafe read of RCU_NOCB offloaded state"
+ );
+
+ return rcu_segcblist_is_offloaded(&rdp->cblist);
+}
/*
* Check the RCU kernel configuration parameters and print informative
@@ -32,10 +51,10 @@ static void __init rcu_bootup_announce_oddness(void)
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -67,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -238,11 +257,13 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
+ *
+ * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->exp_deferred_qs);
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
}
/*
@@ -256,12 +277,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* current task, there might be any number of other tasks blocked while
* in an RCU read-side critical section.
*
+ * Unlike non-preemptible-RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
+ *
* Callers to this function must disable preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
- if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
@@ -292,7 +317,7 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -307,7 +332,7 @@ void rcu_note_context_switch(bool preempt)
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rcu_state.name,
t->pid,
@@ -329,7 +354,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -351,17 +376,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -374,6 +402,8 @@ void __rcu_read_lock(void)
rcu_preempt_read_enter();
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
barrier(); /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -389,8 +419,9 @@ void __rcu_read_unlock(void)
{
struct task_struct *t = current;
+ barrier(); // critical section before exit code.
if (rcu_preempt_read_exit() == 0) {
- barrier(); /* critical section before exit code. */
+ barrier(); // critical-section exit before .s check.
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
}
@@ -431,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
-static void
+static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@@ -450,13 +481,20 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->exp_deferred_qs) {
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
}
t->rcu_read_unlock_special.s = 0;
- if (special.b.need_qs)
- rcu_qs();
+ if (special.b.need_qs) {
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rdp->cpu_no_qs.b.norm = false;
+ rcu_report_qs_rdp(rdp);
+ udelay(rcu_unlock_delay);
+ } else {
+ rcu_qs();
+ }
+ }
/*
* Respond to a request by an expedited grace period for a
@@ -464,7 +502,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
@@ -496,7 +534,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WRITE_ONCE(rnp->exp_tasks, np);
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
if (&t->rcu_node_entry == rnp->boost_tasks)
WRITE_ONCE(rnp->boost_tasks, np);
}
@@ -521,16 +559,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
@@ -545,9 +583,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
rcu_preempt_depth() == 0;
}
@@ -559,7 +597,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@@ -588,9 +626,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
static void rcu_read_unlock_special(struct task_struct *t)
{
unsigned long flags;
+ bool irqs_were_disabled;
bool preempt_bh_were_disabled =
!!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
- bool irqs_were_disabled;
/* NMI handlers cannot block and cannot safely manipulate state. */
if (in_nmi())
@@ -599,30 +637,40 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool exp;
+ bool expboost; // Expedited GP in flight or possible boosting.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- exp = (t->rcu_blocked_node &&
- READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask));
+ expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
+ (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
+ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
+ (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
+ t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
- // wakeup is free or there is an expedited GP.
+ // wakeup is free or there is either an expedited
+ // GP in flight or a potential need to deboost.
raise_softirq_irqoff(RCU_SOFTIRQ);
} else {
// Enabling BH or preempt does reschedule, so...
- // Also if no expediting, slow is OK.
- // Plus nohz_full CPUs eventually get tick enabled.
+ // Also if no expediting and no possible deboosting,
+ // slow is OK. Plus nohz_full CPUs eventually get
+ // tick enabled.
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- !rdp->defer_qs_iw_pending && exp) {
+ expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ IS_ENABLED(CONFIG_PREEMPT_RT))
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
+ rcu_preempt_deferred_qs_handler);
+ else
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = true;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -672,9 +720,7 @@ static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
- if (user || rcu_is_cpu_rrupt_from_idle()) {
- rcu_note_voluntary_context_switch(current);
- }
+ lockdep_assert_irqs_disabled();
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -734,7 +780,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int cpu;
int i;
struct list_head *lhp;
- bool onl;
struct rcu_data *rdp;
struct rcu_node *rnp1;
@@ -758,9 +803,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_cont("\n");
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
- cpu, ".o"[onl],
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
}
@@ -769,6 +813,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
+ * If strict grace periods are enabled, and if the calling
+ * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
+ * report that quiescent state and, if requested, spin for a bit.
+ */
+void rcu_read_unlock_strict(void)
+{
+ struct rcu_data *rdp;
+
+ if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ return;
+ rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
+ rcu_report_qs_rdp(rdp);
+ udelay(rcu_unlock_delay);
+}
+EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
+
+/*
* Tell them what RCU they are running.
*/
static void __init rcu_bootup_announce(void)
@@ -791,10 +853,8 @@ static void rcu_qs(void)
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
- return;
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
/*
@@ -810,7 +870,7 @@ void rcu_all_qs(void)
if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
return;
- preempt_disable();
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
preempt_enable();
@@ -840,8 +900,8 @@ void rcu_note_context_switch(bool preempt)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
- rcu_tasks_qs(current, preempt);
out:
+ rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -867,11 +927,25 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
-static void rcu_preempt_deferred_qs(struct task_struct *t) { }
+
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(rdp);
+}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
@@ -902,7 +976,6 @@ static void rcu_flavor_sched_clock_irq(int user)
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
-
rcu_qs();
}
}
@@ -931,12 +1004,34 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
*/
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
+}
+
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
}
#ifdef CONFIG_RCU_BOOST
@@ -998,11 +1093,12 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
@@ -1033,7 +1129,7 @@ static int rcu_boost_kthread(void *arg)
if (spincnt > 10) {
WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
@@ -1057,7 +1153,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ if (!rnp->boost_kthread_task ||
+ (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1065,7 +1162,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1076,15 +1174,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1098,30 +1187,22 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - rcu_get_root();
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return;
-
- rcu_state.boost = 1;
-
- if (rnp->boost_kthread_task != NULL)
- return;
+ mutex_lock(&rnp->boost_kthread_mutex);
+ if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
+ goto out;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- return;
+ goto out;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1129,6 +1210,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+ mutex_unlock(&rnp->boost_kthread_mutex);
}
/*
@@ -1139,11 +1223,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
* We don't include outgoingcpu in the affinity set, use -1 if there is
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->boost_kthread_mutex.
*/
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rcu_rnp_online_cpus(rnp);
+ unsigned long mask;
cpumask_var_t cm;
int cpu;
@@ -1151,37 +1237,23 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
+ mutex_lock(&rnp->boost_kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
set_cpus_allowed_ptr(t, cm);
+ mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
}
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1190,1324 +1262,20 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void __init rcu_spawn_boost_kthreads(void)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void rcu_prepare_kthreads(int cpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future non-offloaded RCU-related work will need
- * to be done by the current CPU, even if none need be done immediately,
- * returning 1 if so. This function is part of the RCU implementation;
- * it is -not- an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
- !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The value below works well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so. Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdp->last_advance_all)
- return false;
- rdp->last_advance_all = jiffies;
-
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rcu_seq_completed_gp(rdp->gp_seq,
- rcu_seq_current(&rnp->gp_seq)) ||
- unlikely(READ_ONCE(rdp->gpwrap))) &&
- rcu_segcblist_pend_cbs(&rdp->cblist))
- note_gp_changes(rdp);
-
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- cbs_ready = true;
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- unsigned long dj;
-
- lockdep_assert_irqs_disabled();
-
- /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist) ||
- rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdp->last_accelerate = jiffies;
-
- /* Request timer and round. */
- dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
- *nextevt = basemono + dj * TICK_NSEC;
- return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs. The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
- bool needwake;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
- int tne;
-
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
-
- /* Handle nohz enablement switches conservatively. */
- tne = READ_ONCE(tick_nohz_active);
- if (tne != rdp->tick_nohz_enabled_snap) {
- if (!rcu_segcblist_empty(&rdp->cblist))
- invoke_rcu_core(); /* force nohz to see update. */
- rdp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdp->last_accelerate == jiffies)
- return;
- rdp->last_accelerate = jiffies;
- if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_accelerate_cbs(rnp, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake();
- }
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
- * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
- * given, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (!strcasecmp(str, "all"))
- cpumask_setall(rcu_nocb_mask);
- else
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-/*
- * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
- * and this function releases it.
- */
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- bool needwake = false;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
- del_timer(&rdp->nocb_timer);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- if (rdp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_segcblist_is_offloaded(&rdp->cblist))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
- lockdep_assert_irqs_disabled();
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
-/*
- * Awaken the no-CBs grace-period kthead if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- wake_nocb_gp(rdp, false, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_bypass_timer))
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- return;
-}
-
-/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
-static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
- rcu_nocb_lock_irqsave(rdp, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- __call_rcu_nocb_wake(rdp, true, flags);
-}
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
- if (bypass) { // Avoid race with first bypass CB.
- WRITE_ONCE(my_rdp->nocb_defer_wakeup,
- RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_interruptible(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (bypass)
- del_timer(&my_rdp->nocb_bypass_timer);
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only once at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_gp = false;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- return;
- }
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
- WRITE_ONCE(rdp->nocb_cb_sleep, true);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- !READ_ONCE(rdp->nocb_cb_sleep));
- if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
- /* ^^^ Ensure CB invocation follows _sleep test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup);
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
-{
- unsigned long flags;
- int ndw;
-
- rcu_nocb_lock_irqsave(rdp, flags);
- if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
- ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "bB"[timer_pending(&rdp->nocb_bypass_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wastimer;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
- ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
- ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wastimer = timer_pending(&rdp->nocb_timer);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep &&
- !waslocked && !wastimer && !wassleep)
- return; /* Nothing untowards. */
-
- pr_info(" !!! %c%c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[wastimer],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2515,7 +1283,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(void)
{
@@ -2535,39 +1303,5 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_FLAG_RCU);
-}
-
-/* Record the current task on dyntick-idle entry. */
-static void noinstr rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static void noinstr rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static void rcu_dynticks_task_trace_enter(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static void rcu_dynticks_task_trace_exit(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+ housekeeping_affine(current, HK_TYPE_RCU);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..5d666428546b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,12 +7,16 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
+int sysctl_max_rcu_stall_to_panic __read_mostly;
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -22,6 +26,34 @@ int sysctl_panic_on_rcu_stall __read_mostly;
#define RCU_STALL_MIGHT_DIV 8
#define RCU_STALL_MIGHT_MIN (2 * HZ)
+int rcu_exp_jiffies_till_stall_check(void)
+{
+ int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+ int exp_stall_delay_delta = 0;
+ int till_stall_check;
+
+ // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+ if (!cpu_stall_timeout)
+ cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+ // Limit check must be consistent with the Kconfig limits for
+ // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+ // The minimum clamped value is "2UL", because at least one full
+ // tick has to be guaranteed.
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
+
+ if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+ WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+ /* Add extra ~25% out of till_stall_check. */
+ exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+ return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
/* Limit-check stall timeouts specified at boottime and runtime. */
int rcu_jiffies_till_stall_check(void)
{
@@ -106,22 +138,29 @@ early_initcall(check_cpu_stall_init);
/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
static void panic_on_rcu_stall(void)
{
+ static int cpu_stall;
+
+ if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
+ return;
+
if (sysctl_panic_on_rcu_stall)
panic("RCU Stall\n");
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * To perform the reset request from the caller, disable stall detection until
+ * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
+ * loaded. It should be safe to do from the fqs loop as enough timer
+ * interrupts and context switches should have passed.
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+ WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
}
//////////////////////////////////////////////////////////////////////////////
@@ -137,6 +176,7 @@ static void record_gp_stall_check_time(void)
WRITE_ONCE(rcu_state.gp_start, j);
j1 = rcu_jiffies_till_stall_check();
smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
rcu_state.jiffies_resched = j + j1 / 2;
rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -158,7 +198,7 @@ static void rcu_stall_kick_kthreads(void)
{
unsigned long j;
- if (!rcu_kick_kthreads)
+ if (!READ_ONCE(rcu_kick_kthreads))
return;
j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
if (time_after(jiffies, j) && rcu_state.gp_kthread &&
@@ -235,38 +275,50 @@ struct rcu_stall_chk_rdr {
* Report out the state of a not-running task that is stalling the
* current RCU grace period.
*/
-static bool check_slow_task(struct task_struct *t, void *arg)
+static int check_slow_task(struct task_struct *t, void *arg)
{
- struct rcu_node *rnp;
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
- return false; // It is running, so decline to inspect it.
+ return -EBUSY; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
- rnp = t->rcu_blocked_node;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
- return true;
+ return 0;
}
/*
* Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
+ * sections, printing out the tid of each of the first few of them.
*/
-static int rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
+ int i = 0;
int ndetected = 0;
struct rcu_stall_chk_rdr rscr;
struct task_struct *t;
+ struct task_struct *ts[8];
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ lockdep_assert_irqs_disabled();
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
+ get_task_struct(t);
+ ts[i++] = t;
+ if (i >= ARRAY_SIZE(ts))
+ break;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ while (i) {
+ t = ts[--i];
+ if (task_call_func(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
pr_cont(" P%d/%d:%c%c%c%c",
@@ -275,6 +327,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
+ put_task_struct(t);
ndetected++;
}
pr_cont("\n");
@@ -295,8 +349,10 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
-static int rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
@@ -316,33 +372,16 @@ static void rcu_dump_cpu_stacks(void)
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ if (cpu_is_offline(cpu))
+ pr_err("Offline CPU %d blocking current GP.\n", cpu);
+ else
dump_cpu_task(cpu);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- !!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -375,6 +414,56 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
return j > 2 * HZ;
}
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ int cpu;
+ struct task_struct *rcuc;
+ unsigned long j;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return false;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return false;
+
+ j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8ld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -384,16 +473,18 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
* of RCU grace periods that this CPU is ignorant of, for example, "1"
* if the CPU was aware of the previous grace period.
*
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
*/
static void print_cpu_stall_info(int cpu)
{
unsigned long delta;
bool falsepositive;
- char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
+ bool rcuc_starved;
+ unsigned long j;
+ char buf[32];
/*
* We could be printing a lot while holding a spinlock. Avoid
@@ -408,11 +499,13 @@ static void print_cpu_stall_info(int cpu)
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
}
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+ rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
+ if (rcuc_starved)
+ sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -421,36 +514,79 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz,
+ rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
}
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
+ int cpu;
struct task_struct *gpk = rcu_state.gp_kthread;
unsigned long j;
if (rcu_is_gp_kthread_starving(&j)) {
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ cpu = gpk ? task_cpu(gpk) : -1;
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(gpk);
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ dump_cpu_task(cpu);
+ }
wake_up_process(gpk);
}
}
}
+/* Complain about missing wakeups from expired fqs wait timer */
+static void rcu_check_gp_kthread_expired_fqs_timer(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ short gp_state;
+ unsigned long jiffies_fqs;
+ int cpu;
+
+ /*
+ * Order reads of .gp_state and .jiffies_force_qs.
+ * Matching smp_wmb() is present in rcu_gp_fqs_loop().
+ */
+ gp_state = smp_load_acquire(&rcu_state.gp_state);
+ jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs);
+
+ if (gp_state == RCU_GP_WAIT_FQS &&
+ time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
+ gpk && !READ_ONCE(gpk->on_rq)) {
+ cpu = task_cpu(gpk);
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
+ rcu_state.name, (jiffies - jiffies_fqs),
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ data_race(rcu_state.gp_flags),
+ gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
+ data_race(READ_ONCE(gpk->__state)));
+ pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
+ cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
+ }
+}
+
static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
@@ -461,6 +597,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
struct rcu_node *rnp;
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -468,13 +606,13 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
/*
* OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
+ trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected"));
pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
for_each_leaf_node_possible_cpu(rnp, cpu)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
@@ -482,14 +620,15 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
ndetected++;
}
}
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
}
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -501,11 +640,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -513,6 +652,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
WRITE_ONCE(rcu_state.jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
panic_on_rcu_stall();
@@ -528,6 +668,8 @@ static void print_cpu_stall(unsigned long gps)
struct rcu_node *rnp = rcu_get_root();
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -535,19 +677,21 @@ static void print_cpu_stall(unsigned long gps)
/*
* OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
+ trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(smp_processor_id());
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
rcu_dump_cpu_stacks();
@@ -574,6 +718,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool self_detected;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -582,10 +727,21 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
- if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) ||
+ lockdep_assert_irqs_disabled();
+ if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
!rcu_gp_in_progress())
return;
rcu_stall_kick_kthreads();
+
+ /*
+ * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+ * loop has to set jiffies to ensure a non-stale jiffies value. This
+ * is required to have good jiffies value after coming out of long
+ * breaks of jiffies updates. Not doing so can cause false positives.
+ */
+ if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+ return;
+
j = jiffies;
/*
@@ -618,64 +774,145 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
+ self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(gps);
- if (rcu_cpu_stall_ftrace_dump)
- rcu_ftrace_dump(DUMP_ALL);
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+ if (self_detected) {
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(gps);
+ } else {
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2, gps);
+ }
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2, gps);
- if (rcu_cpu_stall_ftrace_dump)
+ if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+
+ if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
+ }
}
}
//////////////////////////////////////////////////////////////////////////////
//
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
+
+/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiscent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
{
+ unsigned long cbs = 0;
int cpu;
unsigned long j;
unsigned long ja;
unsigned long jr;
+ unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->state : 0x1ffffL,
- ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
- READ_ONCE(rnp->gp_seq_needed)))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
- (long)data_race(rnp->gp_seq_needed));
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -685,14 +922,16 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
+ pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
@@ -770,11 +1009,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
@@ -802,7 +1041,7 @@ static bool sysrq_rcu;
module_param(sysrq_rcu, bool, 0444);
/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
+static void sysrq_show_rcu(u8 key)
{
show_rcu_gp_kthreads();
}
@@ -821,3 +1060,67 @@ static int __init rcu_sysrq_init(void)
return 0;
}
early_initcall(rcu_sysrq_init);
+
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends. The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+ int rcsn = rcu_cpu_stall_notifiers;
+
+ WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+ rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+ if (rcsn)
+ return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain. See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..46aaaa9fe339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -25,6 +25,7 @@
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/torture.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -42,6 +43,7 @@
#include <linux/kprobes.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
#define CREATE_TRACE_POINTS
@@ -52,24 +54,13 @@
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
-#ifndef data_race
-#define data_race(expr) \
- ({ \
- expr; \
- })
-#endif
-#ifndef ASSERT_EXCLUSIVE_WRITER
-#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
-#endif
-#ifndef ASSERT_EXCLUSIVE_ACCESS
-#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
-#endif
-
#ifndef CONFIG_TINY_RCU
-module_param(rcu_expedited, int, 0);
-module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
-module_param(rcu_normal_after_boot, int, 0);
+module_param(rcu_expedited, int, 0444);
+module_param(rcu_normal, int, 0444);
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL)
+module_param(rcu_normal_after_boot, int, 0444);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -95,7 +86,7 @@ module_param(rcu_normal_after_boot, int, 0);
* and while lockdep is disabled.
*
* Note that if the CPU is in the idle loop from an RCU point of view (ie:
- * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
* then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
* rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
* in such a section, considering these as in extended quiescent state,
@@ -154,8 +145,45 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
@@ -205,9 +233,10 @@ static bool rcu_boot_ended __read_mostly;
void rcu_end_inkernel_boot(void)
{
rcu_unexpedite_gp();
+ rcu_async_relax();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
- rcu_boot_ended = 1;
+ rcu_boot_ended = true;
}
/*
@@ -230,11 +259,12 @@ void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
+ pr_info("Running RCU synchronous self tests\n");
synchronize_rcu();
synchronize_rcu_expedited();
}
-#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+#if !defined(CONFIG_TINY_RCU)
/*
* Switch to run-time mode once RCU has fully initialized.
@@ -249,7 +279,7 @@ static int __init rcu_set_runtime_mode(void)
}
core_initcall(rcu_set_runtime_mode);
-#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -257,7 +287,7 @@ struct lockdep_map rcu_lock_map = {
.name = "rcu_read_lock",
.key = &rcu_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */
};
EXPORT_SYMBOL_GPL(rcu_lock_map);
@@ -266,7 +296,7 @@ struct lockdep_map rcu_bh_lock_map = {
.name = "rcu_read_lock_bh",
.key = &rcu_bh_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
@@ -279,6 +309,7 @@ struct lockdep_map rcu_sched_lock_map = {
};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
+// Tell lockdep when RCU callbacks are being invoked.
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
@@ -286,7 +317,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
noinstr int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -390,13 +421,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
might_sleep();
continue;
}
- init_rcu_head_on_stack(&rs_array[i].head);
- init_completion(&rs_array[i].completion);
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
}
/* Wait for all callbacks to be invoked. */
@@ -407,13 +439,21 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
wait_for_completion(&rs_array[i].completion);
- destroy_rcu_head_on_stack(&rs_array[i].head);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
@@ -465,7 +505,7 @@ void destroy_rcu_head_on_stack(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
-struct debug_obj_descr rcuhead_debug_descr = {
+const struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
.is_static_object = rcuhead_is_static_object,
};
@@ -485,27 +525,39 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
int ret;
ret = sched_setaffinity(pid, in_mask);
- WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+ WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+
#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_ftrace_dump __read_mostly;
module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
@@ -514,6 +566,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+ return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -531,6 +596,7 @@ static void test_callback(struct rcu_head *r)
}
DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
struct early_boot_kfree_rcu {
struct rcu_head rh;
@@ -539,12 +605,15 @@ struct early_boot_kfree_rcu {
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU))
- call_srcu(&early_srcu, &shead, test_callback);
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
+ call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -567,10 +636,10 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_boot_test_counter++;
- srcu_barrier(&early_srcu);
- }
+ early_boot_test_counter++;
+ srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 491f1347bf43..22c16e2564cc 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -7,6 +7,7 @@
#define pr_fmt(fmt) "reboot: " fmt
+#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/export.h>
#include <linux/kexec.h>
@@ -22,16 +23,17 @@
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
-int C_A_D = 1;
+static int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
-#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#if defined(CONFIG_ARM)
#define DEFAULT_REBOOT_MODE = REBOOT_HARD
#else
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+EXPORT_SYMBOL_GPL(reboot_mode);
enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
@@ -46,12 +48,29 @@ int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+struct sys_off_handler {
+ struct notifier_block nb;
+ int (*sys_off_cb)(struct sys_off_data *data);
+ void *cb_data;
+ enum sys_off_mode mode;
+ bool blocking;
+ void *list;
+ struct device *dev;
+};
+
/*
- * If set, this is used for preparing the system to power off.
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allowes kernel_halt() to notify users
+ * of that.
*/
+static bool poweroff_fallback_to_halt;
-void (*pm_power_off_prepare)(void);
-EXPORT_SYMBOL_GPL(pm_power_off_prepare);
+/*
+ * Temporary stub that prevents linkage failure while we're in process
+ * of removing all uses of legacy pm_power_off() around the kernel.
+ */
+void __weak (*pm_power_off)(void);
/**
* emergency_restart - reboot the system
@@ -64,6 +83,7 @@ EXPORT_SYMBOL_GPL(pm_power_off_prepare);
void emergency_restart(void)
{
kmsg_dump(KMSG_DUMP_EMERG);
+ system_state = SYSTEM_RESTART;
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -233,6 +253,17 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for restart.
+ */
+static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
+
+static void do_kernel_restart_prepare(void)
+{
+ blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -244,6 +275,7 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ do_kernel_restart_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
@@ -273,12 +305,386 @@ void kernel_halt(void)
kernel_shutdown_prepare(SYSTEM_HALT);
migrate_to_reboot_cpu();
syscore_shutdown();
- pr_emerg("System halted\n");
+ if (poweroff_fallback_to_halt)
+ pr_emerg("Power off not available: System halted instead\n");
+ else
+ pr_emerg("System halted\n");
kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_halt();
}
EXPORT_SYMBOL_GPL(kernel_halt);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for power off.
+ */
+static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to power off system.
+ */
+static ATOMIC_NOTIFIER_HEAD(power_off_handler_list);
+
+static int sys_off_notify(struct notifier_block *nb,
+ unsigned long mode, void *cmd)
+{
+ struct sys_off_handler *handler;
+ struct sys_off_data data = {};
+
+ handler = container_of(nb, struct sys_off_handler, nb);
+ data.cb_data = handler->cb_data;
+ data.mode = mode;
+ data.cmd = cmd;
+ data.dev = handler->dev;
+
+ return handler->sys_off_cb(&data);
+}
+
+static struct sys_off_handler platform_sys_off_handler;
+
+static struct sys_off_handler *alloc_sys_off_handler(int priority)
+{
+ struct sys_off_handler *handler;
+ gfp_t flags;
+
+ /*
+ * Platforms like m68k can't allocate sys_off handler dynamically
+ * at the early boot time because memory allocator isn't available yet.
+ */
+ if (priority == SYS_OFF_PRIO_PLATFORM) {
+ handler = &platform_sys_off_handler;
+ if (handler->cb_data)
+ return ERR_PTR(-EBUSY);
+ } else {
+ if (system_state > SYSTEM_RUNNING)
+ flags = GFP_ATOMIC;
+ else
+ flags = GFP_KERNEL;
+
+ handler = kzalloc(sizeof(*handler), flags);
+ if (!handler)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return handler;
+}
+
+static void free_sys_off_handler(struct sys_off_handler *handler)
+{
+ if (handler == &platform_sys_off_handler)
+ memset(handler, 0, sizeof(*handler));
+ else
+ kfree(handler);
+}
+
+/**
+ * register_sys_off_handler - Register sys-off handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers system power-off or restart handler that will be invoked
+ * at the step corresponding to the given sys-off mode. Handler's callback
+ * should return NOTIFY_DONE to permit execution of the next handler in
+ * the call chain or NOTIFY_STOP to break the chain (in error case for
+ * example).
+ *
+ * Multiple handlers can be registered at the default priority level.
+ *
+ * Only one handler can be registered at the non-default priority level,
+ * otherwise ERR_PTR(-EBUSY) is returned.
+ *
+ * Returns a new instance of struct sys_off_handler on success, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+struct sys_off_handler *
+register_sys_off_handler(enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+ int err;
+
+ handler = alloc_sys_off_handler(priority);
+ if (IS_ERR(handler))
+ return handler;
+
+ switch (mode) {
+ case SYS_OFF_MODE_POWER_OFF_PREPARE:
+ handler->list = &power_off_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_POWER_OFF:
+ handler->list = &power_off_handler_list;
+ break;
+
+ case SYS_OFF_MODE_RESTART_PREPARE:
+ handler->list = &restart_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_RESTART:
+ handler->list = &restart_handler_list;
+ break;
+
+ default:
+ free_sys_off_handler(handler);
+ return ERR_PTR(-EINVAL);
+ }
+
+ handler->nb.notifier_call = sys_off_notify;
+ handler->nb.priority = priority;
+ handler->sys_off_cb = callback;
+ handler->cb_data = cb_data;
+ handler->mode = mode;
+
+ if (handler->blocking) {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = blocking_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = blocking_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ } else {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = atomic_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ }
+
+ if (err) {
+ free_sys_off_handler(handler);
+ return ERR_PTR(err);
+ }
+
+ return handler;
+}
+EXPORT_SYMBOL_GPL(register_sys_off_handler);
+
+/**
+ * unregister_sys_off_handler - Unregister sys-off handler
+ * @handler: Sys-off handler
+ *
+ * Unregisters given sys-off handler.
+ */
+void unregister_sys_off_handler(struct sys_off_handler *handler)
+{
+ int err;
+
+ if (IS_ERR_OR_NULL(handler))
+ return;
+
+ if (handler->blocking)
+ err = blocking_notifier_chain_unregister(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_unregister(handler->list,
+ &handler->nb);
+
+ /* sanity check, shall never happen */
+ WARN_ON(err);
+
+ free_sys_off_handler(handler);
+}
+EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
+
+static void devm_unregister_sys_off_handler(void *data)
+{
+ struct sys_off_handler *handler = data;
+
+ unregister_sys_off_handler(handler);
+}
+
+/**
+ * devm_register_sys_off_handler - Register sys-off handler
+ * @dev: Device that registers handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers resource-managed sys-off handler.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_sys_off_handler(struct device *dev,
+ enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(mode, priority, callback, cb_data);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ handler->dev = dev;
+
+ return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
+ handler);
+}
+EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
+
+/**
+ * devm_register_power_off_handler - Register power-off handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using power-off mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_power_off_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
+
+/**
+ * devm_register_restart_handler - Register restart handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using restart mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_restart_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_RESTART,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_restart_handler);
+
+static struct sys_off_handler *platform_power_off_handler;
+
+static int platform_power_off_notify(struct sys_off_data *data)
+{
+ void (*platform_power_power_off_cb)(void) = data->cb_data;
+
+ platform_power_power_off_cb();
+
+ return NOTIFY_DONE;
+}
+
+/**
+ * register_platform_power_off - Register platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Registers power-off callback that will be called as last step
+ * of the power-off sequence. This callback is expected to be invoked
+ * for the last resort. Only one platform power-off callback is allowed
+ * to be registered at a time.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int register_platform_power_off(void (*power_off)(void))
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_PLATFORM,
+ platform_power_off_notify,
+ power_off);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ platform_power_off_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_platform_power_off);
+
+/**
+ * unregister_platform_power_off - Unregister platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Unregisters previously registered platform power-off callback.
+ */
+void unregister_platform_power_off(void (*power_off)(void))
+{
+ if (platform_power_off_handler &&
+ platform_power_off_handler->cb_data == power_off) {
+ unregister_sys_off_handler(platform_power_off_handler);
+ platform_power_off_handler = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_platform_power_off);
+
+static int legacy_pm_power_off(struct sys_off_data *data)
+{
+ if (pm_power_off)
+ pm_power_off();
+
+ return NOTIFY_DONE;
+}
+
+static void do_kernel_power_off_prepare(void)
+{
+ blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
+}
+
+/**
+ * do_kernel_power_off - Execute kernel power-off handler call chain
+ *
+ * Expected to be called as last step of the power-off sequence.
+ *
+ * Powers off the system immediately if a power-off handler function has
+ * been registered. Otherwise does nothing.
+ */
+void do_kernel_power_off(void)
+{
+ struct sys_off_handler *sys_off = NULL;
+
+ /*
+ * Register sys-off handlers for legacy PM callback. This allows
+ * legacy PM callbacks temporary co-exist with the new sys-off API.
+ *
+ * TODO: Remove legacy handlers once all legacy PM users will be
+ * switched to the sys-off based APIs.
+ */
+ if (pm_power_off)
+ sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ legacy_pm_power_off, NULL);
+
+ atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+
+ unregister_sys_off_handler(sys_off);
+}
+
+/**
+ * kernel_can_power_off - check whether system can be powered off
+ *
+ * Returns true if power-off handler is registered and system can be
+ * powered off, false otherwise.
+ */
+bool kernel_can_power_off(void)
+{
+ return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) ||
+ pm_power_off;
+}
+EXPORT_SYMBOL_GPL(kernel_can_power_off);
+
/**
* kernel_power_off - power_off the system
*
@@ -287,8 +693,7 @@ EXPORT_SYMBOL_GPL(kernel_halt);
void kernel_power_off(void)
{
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
+ do_kernel_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
@@ -338,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) {
+ poweroff_fallback_to_halt = true;
cmd = LINUX_REBOOT_CMD_HALT;
+ }
mutex_lock(&system_transition_mutex);
switch (cmd) {
@@ -358,7 +765,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
- panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
@@ -416,7 +822,8 @@ void ctrl_alt_del(void)
kill_cad_pid(SIGINT, 1);
}
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+#define POWEROFF_CMD_PATH_LEN 256
+static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static const char reboot_cmd[] = "/sbin/reboot";
static int run_cmd(const char *cmd)
@@ -518,6 +925,90 @@ void orderly_reboot(void)
}
EXPORT_SYMBOL_GPL(orderly_reboot);
+/**
+ * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void hw_failure_emergency_poweroff_func(struct work_struct *work)
+{
+ /*
+ * We have reached here after the emergency shutdown waiting period has
+ * expired. This means orderly_poweroff has not been able to shut off
+ * the system for some reason.
+ *
+ * Try to shut down the system immediately using kernel_power_off
+ * if populated
+ */
+ pr_emerg("Hardware protection timed-out. Trying forced poweroff\n");
+ kernel_power_off();
+
+ /*
+ * Worst of the worst case trigger emergency restart
+ */
+ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
+ emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
+ hw_failure_emergency_poweroff_func);
+
+/**
+ * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a given period of time. If time is negative this is not scheduled.
+ */
+static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
+{
+ if (poweroff_delay_ms <= 0)
+ return;
+ schedule_delayed_work(&hw_failure_emergency_poweroff_work,
+ msecs_to_jiffies(poweroff_delay_ms));
+}
+
+/**
+ * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot
+ *
+ * @reason: Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced: Time to wait for orderly shutdown or reboot before
+ * triggering it. Negative value disables the forced
+ * shutdown or reboot.
+ * @shutdown: If true, indicates that a shutdown will happen
+ * after the critical tempeature is reached.
+ * If false, indicates that a reboot will happen
+ * after the critical tempeature is reached.
+ *
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include a thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
+ */
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
+{
+ static atomic_t allow_proceed = ATOMIC_INIT(1);
+
+ pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
+
+ /* Shutdown should be initiated only once. */
+ if (!atomic_dec_and_test(&allow_proceed))
+ return;
+
+ /*
+ * Queue a backup emergency shutdown in the event of
+ * orderly_poweroff failure
+ */
+ hw_failure_emergency_poweroff(ms_until_forced);
+ if (shutdown)
+ orderly_poweroff(true);
+ else
+ orderly_reboot();
+}
+EXPORT_SYMBOL_GPL(__hw_protection_shutdown);
+
static int __init reboot_setup(char *str)
{
for (;;) {
@@ -551,22 +1042,26 @@ static int __init reboot_setup(char *str)
break;
case 's':
- {
- int rc;
-
- if (isdigit(*(str+1))) {
- rc = kstrtoint(str+1, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3))) {
- rc = kstrtoint(str+3, 0, &reboot_cpu);
- if (rc)
- return rc;
+ /*
+ * reboot_cpu is s[mp]#### with #### being the processor
+ * to be used for rebooting. Skip 's' or 'smp' prefix.
+ */
+ str += str[1] == 'm' && str[2] == 'p' ? 3 : 1;
+
+ if (isdigit(str[0])) {
+ int cpu = simple_strtoul(str, NULL, 0);
+
+ if (cpu >= num_possible_cpus()) {
+ pr_err("Ignoring the CPU number in reboot= option. "
+ "CPU %d exceeds possible cpu number %d\n",
+ cpu, num_possible_cpus());
+ break;
+ }
+ reboot_cpu = cpu;
} else
*mode = REBOOT_SOFT;
break;
- }
+
case 'g':
*mode = REBOOT_GPIO;
break;
@@ -594,3 +1089,246 @@ static int __init reboot_setup(char *str)
return 1;
}
__setup("reboot=", reboot_setup);
+
+#ifdef CONFIG_SYSFS
+
+#define REBOOT_COLD_STR "cold"
+#define REBOOT_WARM_STR "warm"
+#define REBOOT_HARD_STR "hard"
+#define REBOOT_SOFT_STR "soft"
+#define REBOOT_GPIO_STR "gpio"
+#define REBOOT_UNDEFINED_STR "undefined"
+
+#define BOOT_TRIPLE_STR "triple"
+#define BOOT_KBD_STR "kbd"
+#define BOOT_BIOS_STR "bios"
+#define BOOT_ACPI_STR "acpi"
+#define BOOT_EFI_STR "efi"
+#define BOOT_PCI_STR "pci"
+
+static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_mode) {
+ case REBOOT_COLD:
+ val = REBOOT_COLD_STR;
+ break;
+ case REBOOT_WARM:
+ val = REBOOT_WARM_STR;
+ break;
+ case REBOOT_HARD:
+ val = REBOOT_HARD_STR;
+ break;
+ case REBOOT_SOFT:
+ val = REBOOT_SOFT_STR;
+ break;
+ case REBOOT_GPIO:
+ val = REBOOT_GPIO_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sprintf(buf, "%s\n", val);
+}
+static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, REBOOT_COLD_STR, strlen(REBOOT_COLD_STR)))
+ reboot_mode = REBOOT_COLD;
+ else if (!strncmp(buf, REBOOT_WARM_STR, strlen(REBOOT_WARM_STR)))
+ reboot_mode = REBOOT_WARM;
+ else if (!strncmp(buf, REBOOT_HARD_STR, strlen(REBOOT_HARD_STR)))
+ reboot_mode = REBOOT_HARD;
+ else if (!strncmp(buf, REBOOT_SOFT_STR, strlen(REBOOT_SOFT_STR)))
+ reboot_mode = REBOOT_SOFT;
+ else if (!strncmp(buf, REBOOT_GPIO_STR, strlen(REBOOT_GPIO_STR)))
+ reboot_mode = REBOOT_GPIO;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode);
+
+#ifdef CONFIG_X86
+static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", reboot_force);
+}
+static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool res;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (kstrtobool(buf, &res))
+ return -EINVAL;
+
+ reboot_default = 0;
+ reboot_force = res;
+
+ return count;
+}
+static struct kobj_attribute reboot_force_attr = __ATTR_RW(force);
+
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_type) {
+ case BOOT_TRIPLE:
+ val = BOOT_TRIPLE_STR;
+ break;
+ case BOOT_KBD:
+ val = BOOT_KBD_STR;
+ break;
+ case BOOT_BIOS:
+ val = BOOT_BIOS_STR;
+ break;
+ case BOOT_ACPI:
+ val = BOOT_ACPI_STR;
+ break;
+ case BOOT_EFI:
+ val = BOOT_EFI_STR;
+ break;
+ case BOOT_CF9_FORCE:
+ val = BOOT_PCI_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sprintf(buf, "%s\n", val);
+}
+static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, BOOT_TRIPLE_STR, strlen(BOOT_TRIPLE_STR)))
+ reboot_type = BOOT_TRIPLE;
+ else if (!strncmp(buf, BOOT_KBD_STR, strlen(BOOT_KBD_STR)))
+ reboot_type = BOOT_KBD;
+ else if (!strncmp(buf, BOOT_BIOS_STR, strlen(BOOT_BIOS_STR)))
+ reboot_type = BOOT_BIOS;
+ else if (!strncmp(buf, BOOT_ACPI_STR, strlen(BOOT_ACPI_STR)))
+ reboot_type = BOOT_ACPI;
+ else if (!strncmp(buf, BOOT_EFI_STR, strlen(BOOT_EFI_STR)))
+ reboot_type = BOOT_EFI;
+ else if (!strncmp(buf, BOOT_PCI_STR, strlen(BOOT_PCI_STR)))
+ reboot_type = BOOT_CF9_FORCE;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_type_attr = __ATTR_RW(type);
+#endif
+
+#ifdef CONFIG_SMP
+static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", reboot_cpu);
+}
+static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpunum;
+ int rc;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ rc = kstrtouint(buf, 0, &cpunum);
+
+ if (rc)
+ return rc;
+
+ if (cpunum >= num_possible_cpus())
+ return -ERANGE;
+
+ reboot_default = 0;
+ reboot_cpu = cpunum;
+
+ return count;
+}
+static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu);
+#endif
+
+static struct attribute *reboot_attrs[] = {
+ &reboot_mode_attr.attr,
+#ifdef CONFIG_X86
+ &reboot_force_attr.attr,
+ &reboot_type_attr.attr,
+#endif
+#ifdef CONFIG_SMP
+ &reboot_cpu_attr.attr,
+#endif
+ NULL,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_reboot_table[] = {
+ {
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+static const struct attribute_group reboot_attr_group = {
+ .attrs = reboot_attrs,
+};
+
+static int __init reboot_ksysfs_init(void)
+{
+ struct kobject *reboot_kobj;
+ int ret;
+
+ reboot_kobj = kobject_create_and_add("reboot", kernel_kobj);
+ if (!reboot_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(reboot_kobj, &reboot_attr_group);
+ if (ret) {
+ kobject_put(reboot_kobj);
+ return ret;
+ }
+
+ kernel_reboot_sysctls_init();
+
+ return 0;
+}
+late_initcall(reboot_ksysfs_init);
+
+#endif
diff --git a/kernel/regset.c b/kernel/regset.c
new file mode 100644
index 000000000000..586823786f39
--- /dev/null
+++ b/kernel/regset.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/regset.h>
+
+static int __regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ void *p = *data, *to_free = NULL;
+ int res;
+
+ if (!regset->regset_get)
+ return -EOPNOTSUPP;
+ if (size > regset->n * regset->size)
+ size = regset->n * regset->size;
+ if (!p) {
+ to_free = p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ res = regset->regset_get(target, regset,
+ (struct membuf){.p = p, .left = size});
+ if (res < 0) {
+ kfree(to_free);
+ return res;
+ }
+ *data = p;
+ return size - res;
+}
+
+int regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void *data)
+{
+ return __regset_get(target, regset, size, &data);
+}
+EXPORT_SYMBOL(regset_get);
+
+int regset_get_alloc(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ *data = NULL;
+ return __regset_get(target, regset, size, data);
+}
+EXPORT_SYMBOL(regset_get_alloc);
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target: thread to be examined
+ * @view: &struct user_regset_view describing user thread machine state
+ * @setno: index in @view->regsets
+ * @offset: offset into the regset data, in bytes
+ * @size: amount of data to copy, in bytes
+ * @data: user-mode pointer to copy into
+ */
+int copy_regset_to_user(struct task_struct *target,
+ const struct user_regset_view *view,
+ unsigned int setno,
+ unsigned int offset, unsigned int size,
+ void __user *data)
+{
+ const struct user_regset *regset = &view->regsets[setno];
+ void *buf;
+ int ret;
+
+ ret = regset_get_alloc(target, regset, size, &buf);
+ if (ret > 0)
+ ret = copy_to_user(data, buf, ret) ? -EFAULT : 0;
+ kfree(buf);
+ return ret;
+}
diff --git a/kernel/relay.c b/kernel/relay.c
index 72fe443ea78f..a8e90e98bf2c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -28,15 +28,6 @@ static DEFINE_MUTEX(relay_channels_mutex);
static LIST_HEAD(relay_channels);
/*
- * close() vm_op implementation for relay file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
- struct rchan_buf *buf = vma->vm_private_data;
- buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
* fault() vm_op implementation for relay file mapping.
*/
static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
@@ -62,7 +53,6 @@ static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
*/
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
- .close = relay_file_mmap_close,
};
/*
@@ -70,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- const size_t pa_size = n_pages * sizeof(struct page *);
- if (pa_size > PAGE_SIZE)
- return vzalloc(pa_size);
- return kzalloc(pa_size, GFP_KERNEL);
+ return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
@@ -96,7 +83,6 @@ static void relay_free_page_array(struct page **array)
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
unsigned long length = vma->vm_end - vma->vm_start;
- struct file *filp = vma->vm_file;
if (!buf)
return -EBADF;
@@ -105,9 +91,8 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_private_data = buf;
- buf->chan->cb->buf_mapped(buf, filp);
return 0;
}
@@ -163,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -197,6 +182,7 @@ free_buf:
static void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
+ free_percpu(chan->buf);
kfree(chan);
}
@@ -263,70 +249,16 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
* High-level relay kernel API and associated functions.
*/
-/*
- * rchan_callback implementations defining default channel behavior. Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback. Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
- void *subbuf,
- void *prev_subbuf,
- size_t prev_padding)
+static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
+ void *prev_subbuf, size_t prev_padding)
{
- if (relay_buf_full(buf))
- return 0;
+ if (!buf->chan->cb->subbuf_start)
+ return !relay_buf_full(buf);
- return 1;
+ return buf->chan->cb->subbuf_start(buf, subbuf,
+ prev_subbuf, prev_padding);
}
-/*
- * buf_mapped() default callback. Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
-}
-
-/*
- * buf_unmapped() default callback. Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
-}
-
-/*
- * create_buf_file_create() default callback. Does nothing.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
- struct dentry *parent,
- umode_t mode,
- struct rchan_buf *buf,
- int *is_global)
-{
- return NULL;
-}
-
-/*
- * remove_buf_file() default callback. Does nothing.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
- return -EINVAL;
-}
-
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
- .subbuf_start = subbuf_start_default_callback,
- .buf_mapped = buf_mapped_default_callback,
- .buf_unmapped = buf_unmapped_default_callback,
- .create_buf_file = create_buf_file_default_callback,
- .remove_buf_file = remove_buf_file_default_callback,
-};
-
/**
* wakeup_readers - wake up readers waiting on a channel
* @work: contains the channel buffer
@@ -370,7 +302,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
- buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+ relay_subbuf_start(buf, buf->data, NULL, 0);
}
/**
@@ -443,7 +375,7 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
- struct rchan_buf *buf = NULL;
+ struct rchan_buf *buf;
struct dentry *dentry;
if (chan->is_global)
@@ -498,27 +430,6 @@ static void relay_close_buf(struct rchan_buf *buf)
kref_put(&buf->kref, relay_remove_buf);
}
-static void setup_callbacks(struct rchan *chan,
- struct rchan_callbacks *cb)
-{
- if (!cb) {
- chan->cb = &default_channel_callbacks;
- return;
- }
-
- if (!cb->subbuf_start)
- cb->subbuf_start = subbuf_start_default_callback;
- if (!cb->buf_mapped)
- cb->buf_mapped = buf_mapped_default_callback;
- if (!cb->buf_unmapped)
- cb->buf_unmapped = buf_unmapped_default_callback;
- if (!cb->create_buf_file)
- cb->create_buf_file = create_buf_file_default_callback;
- if (!cb->remove_buf_file)
- cb->remove_buf_file = remove_buf_file_default_callback;
- chan->cb = cb;
-}
-
int relay_prepare_cpu(unsigned int cpu)
{
struct rchan *chan;
@@ -526,7 +437,7 @@ int relay_prepare_cpu(unsigned int cpu)
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
- if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+ if (*per_cpu_ptr(chan->buf, cpu))
continue;
buf = relay_open_buf(chan, cpu);
if (!buf) {
@@ -564,7 +475,7 @@ struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
- struct rchan_callbacks *cb,
+ const struct rchan_callbacks *cb,
void *private_data)
{
unsigned int i;
@@ -575,6 +486,8 @@ struct rchan *relay_open(const char *base_filename,
return NULL;
if (subbuf_size > UINT_MAX / n_subbufs)
return NULL;
+ if (!cb || !cb->create_buf_file || !cb->remove_buf_file)
+ return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
@@ -594,9 +507,9 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
- setup_callbacks(chan, cb);
+ chan->cb = cb;
kref_init(&chan->kref);
mutex_lock(&relay_channels_mutex);
@@ -665,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan,
if (!chan || !base_filename)
return -EINVAL;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
@@ -779,7 +692,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
- if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
@@ -1001,7 +914,7 @@ static int relay_file_read_avail(struct rchan_buf *buf)
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
- size_t consumed = buf->subbufs_consumed;
+ size_t consumed;
relay_file_read_consume(buf, 0, 0);
@@ -1076,7 +989,8 @@ static size_t relay_file_read_start_pos(struct rchan_buf *buf)
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
- size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
+ size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
+ % (n_subbufs * subbuf_size);
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
@@ -1159,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp,
return written;
}
-static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
-{
- rbuf->bytes_consumed += bytes_consumed;
-
- if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
- relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
- rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
- }
-}
-
-static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- struct rchan_buf *rbuf;
-
- rbuf = (struct rchan_buf *)page_private(buf->page);
- relay_consume_bytes(rbuf, buf->private);
-}
-
-static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .release = relay_pipe_buf_release,
- .try_steal = generic_pipe_buf_try_steal,
- .get = generic_pipe_buf_get,
-};
-
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
-/*
- * subbuf_splice_actor - splice up to one subbuf's worth of data
- */
-static ssize_t subbuf_splice_actor(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags,
- int *nonpad_ret)
-{
- unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
- struct rchan_buf *rbuf = in->private_data;
- unsigned int subbuf_size = rbuf->chan->subbuf_size;
- uint64_t pos = (uint64_t) *ppos;
- uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
- size_t read_start = (size_t) do_div(pos, alloc_size);
- size_t read_subbuf = read_start / subbuf_size;
- size_t padding = rbuf->padding[read_subbuf];
- size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .nr_pages = 0,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .partial = partial,
- .ops = &relay_pipe_buf_ops,
- .spd_release = relay_page_release,
- };
- ssize_t ret;
-
- if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
- return 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- /*
- * Adjust read len, if longer than what is available
- */
- if (len > (subbuf_size - read_start % subbuf_size))
- len = subbuf_size - read_start % subbuf_size;
-
- subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
- pidx = (read_start / PAGE_SIZE) % subbuf_pages;
- poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
-
- for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
- unsigned int this_len, this_end, private;
- unsigned int cur_pos = read_start + total_len;
-
- if (!len)
- break;
-
- this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
- private = this_len;
-
- spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
- spd.partial[spd.nr_pages].offset = poff;
-
- this_end = cur_pos + this_len;
- if (this_end >= nonpad_end) {
- this_len = nonpad_end - cur_pos;
- private = this_len + padding;
- }
- spd.partial[spd.nr_pages].len = this_len;
- spd.partial[spd.nr_pages].private = private;
-
- len -= this_len;
- total_len += this_len;
- poff = 0;
- pidx = (pidx + 1) % subbuf_pages;
-
- if (this_end >= nonpad_end) {
- spd.nr_pages++;
- break;
- }
- }
-
- ret = 0;
- if (!spd.nr_pages)
- goto out;
-
- ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
- if (ret < 0 || ret < total_len)
- goto out;
-
- if (read_start + ret == nonpad_end)
- ret += padding;
-
-out:
- splice_shrink_spd(&spd);
- return ret;
-}
-
-static ssize_t relay_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- ssize_t spliced;
- int ret;
- int nonpad_ret = 0;
-
- ret = 0;
- spliced = 0;
-
- while (len && !spliced) {
- ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
- if (ret < 0)
- break;
- else if (!ret) {
- if (flags & SPLICE_F_NONBLOCK)
- ret = -EAGAIN;
- break;
- }
-
- *ppos += ret;
- if (ret > len)
- len = 0;
- else
- len -= ret;
- spliced += nonpad_ret;
- nonpad_ret = 0;
- }
-
- if (spliced)
- return spliced;
-
- return ret;
-}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
@@ -1328,6 +1081,5 @@ const struct file_operations relay_file_operations = {
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
- .splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
diff --git a/kernel/resource.c b/kernel/resource.c
index 841737bbda9e..fcbca39dbc45 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -18,12 +18,17 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
@@ -53,33 +58,17 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p, bool skip_children)
{
- /* Caller wants to traverse through siblings only */
- if (sibling_only)
- return p->sibling;
-
- if (p->child)
+ if (!skip_children && p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct resource *p = v;
- (*pos)++;
- return (void *)next_resource(p, false);
-}
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
#ifdef CONFIG_PROC_FS
@@ -88,14 +77,28 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = PDE_DATA(file_inode(m->file));
- loff_t l = 0;
+ struct resource *root = pde_data(file_inode(m->file));
+ struct resource *p;
+ loff_t l = *pos;
+
read_lock(&resource_lock);
- for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
- ;
+ for_each_resource(root, p, false) {
+ if (l-- == 0)
+ break;
+ }
+
return p;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+
+ (*pos)++;
+
+ return (void *)next_resource(p, false);
+}
+
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
@@ -104,7 +107,7 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = PDE_DATA(file_inode(m->file));
+ struct resource *root = pde_data(file_inode(m->file));
struct resource *r = v, *p;
unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
@@ -149,36 +152,19 @@ __initcall(ioresources_init);
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
@@ -320,29 +306,26 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/**
- * Finds the lowest iomem resource that covers part of [@start..@end]. The
- * caller must specify @start, @end, @flags, and @desc (which may be
- * IORES_DESC_NONE).
+ * find_next_iomem_res - Finds the lowest iomem resource that covers part of
+ * [@start..@end].
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
* -ENODEV. Returns -EINVAL for invalid parameters.
*
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
- *
* @start: start address of the resource searched for
* @end: end address of same resource
* @flags: flags which the resource must have
* @desc: descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
* @res: return ptr, if resource found
+ *
+ * The caller must specify @start, @end, @flags, and @desc
+ * (which may be IORES_DESC_NONE).
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, struct resource *res)
+ struct resource *res)
{
- bool siblings_only = true;
struct resource *p;
if (!res)
@@ -353,7 +336,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+ for_each_resource(&iomem_resource, p, false) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -364,13 +347,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p->end < start)
continue;
- /*
- * Now that we found a range that matches what we look for,
- * check the flags and the descriptor. If we were not asked to
- * use only the first level, start looking at children as well.
- */
- siblings_only = first_lvl;
-
if ((p->flags & flags) != flags)
continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
@@ -382,10 +358,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p) {
/* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
+ *res = (struct resource) {
+ .start = max(start, p->start),
+ .end = min(end, p->end),
+ .flags = p->flags,
+ .desc = p->desc,
+ .parent = p->parent,
+ };
}
read_unlock(&resource_lock);
@@ -394,14 +373,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, void *arg,
+ void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
int ret = -EINVAL;
while (start < end &&
- !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+ !find_next_iomem_res(start, end, flags, desc, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
@@ -413,11 +392,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
}
/**
- * Walks through iomem resources and calls func() with matching resource
- * ranges. This walks through whole tree and not just first level children.
- * All the memory ranges which overlap start,end and also match flags and
- * desc are valid candidates.
- *
+ * walk_iomem_res_desc - Walks through iomem resources and calls func()
+ * with matching resource ranges.
+ * *
* @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
* @flags: I/O resource flags
* @start: start addr
@@ -425,13 +402,16 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
* @arg: function argument for the callback @func
* @func: callback function that is called for each qualifying resource area
*
+ * All the memory ranges which overlap start,end and also match flags and
+ * desc are valid candidates.
+ *
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
- return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+ return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
@@ -447,8 +427,63 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
{
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
+}
+
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ unsigned long flags;
+ int ret = -1;
+
+ /* create a list */
+ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+ if (!rams)
+ return ret;
+
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((start < end) &&
+ (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+
+ rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+ (rams_size + 16) * sizeof(struct resource),
+ GFP_KERNEL);
+ if (!rams_new)
+ goto out;
+
+ rams = rams_new;
+ rams_size += 16;
+ }
+
+ rams[i].start = res.start;
+ rams[i++].end = res.end;
+
+ start = res.end + 1;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
+ if (ret)
+ break;
+ }
+
+out:
+ kvfree(rams);
+ return ret;
}
/*
@@ -460,17 +495,14 @@ int walk_mem_res(u64 start, u64 end, void *arg,
{
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
}
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -485,8 +517,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
- !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
- false, &res)) {
+ !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
pfn = PFN_UP(res.start);
end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
@@ -513,6 +544,35 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+static int __region_intersects(struct resource *parent, resource_size_t start,
+ size_t size, unsigned long flags,
+ unsigned long desc)
+{
+ struct resource res;
+ int type = 0; int other = 0;
+ struct resource *p;
+
+ res.start = start;
+ res.end = start + size - 1;
+
+ for (p = parent->child; p ; p = p->sibling) {
+ bool is_type = (((p->flags & flags) == flags) &&
+ ((desc == IORES_DESC_NONE) ||
+ (desc == p->desc)));
+
+ if (resource_overlaps(p, &res))
+ is_type ? type++ : other++;
+ }
+
+ if (type == 0)
+ return REGION_DISJOINT;
+
+ if (other == 0)
+ return REGION_INTERSECTS;
+
+ return REGION_MIXED;
+}
+
/**
* region_intersects() - determine intersection of region with known resources
* @start: region start address
@@ -536,31 +596,13 @@ EXPORT_SYMBOL_GPL(page_is_ram);
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- struct resource res;
- int type = 0; int other = 0;
- struct resource *p;
-
- res.start = start;
- res.end = start + size - 1;
+ int ret;
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- bool is_type = (((p->flags & flags) == flags) &&
- ((desc == IORES_DESC_NONE) ||
- (desc == p->desc)));
-
- if (resource_overlaps(p, &res))
- is_type ? type++ : other++;
- }
+ ret = __region_intersects(&iomem_resource, start, size, flags, desc);
read_unlock(&resource_lock);
- if (other == 0)
- return type ? REGION_INTERSECTS : REGION_DISJOINT;
-
- if (type)
- return REGION_MIXED;
-
- return REGION_DISJOINT;
+ return ret;
}
EXPORT_SYMBOL_GPL(region_intersects);
@@ -901,10 +943,17 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
/**
* remove_resource - Remove a resource in the resource tree
@@ -1112,31 +1161,65 @@ resource_size_t resource_alignment(struct resource *res)
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+ /* pairs with smp_store_release() in iomem_init_inode() */
+ struct inode *inode = smp_load_acquire(&iomem_inode);
+
+ /*
+ * Check that the initialization has completed. Losing the race
+ * is ok because it means drivers are claiming resources before
+ * the fs_initcall level of init and prevent iomem_get_mapping users
+ * from establishing mappings.
+ */
+ if (!inode)
+ return;
+
+ /*
+ * The expectation is that the driver has successfully marked
+ * the resource busy by this point, so devmem_is_allowed()
+ * should start returning false, however for performance this
+ * does not iterate the entire resource range.
+ */
+ if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+ devmem_is_allowed(PHYS_PFN(res->end))) {
+ /*
+ * *cringe* iomem=relaxed says "go ahead, what's the
+ * worst that can happen?"
+ */
+ return;
+ }
+
+ unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+ /*
+ * This function is only called from file open paths, hence guaranteed
+ * that fs_initcalls have completed and no need to check for NULL. But
+ * since revoke_iomem can be called before the initcall we still need
+ * the barrier to appease checkers.
+ */
+ return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
+static int __request_region_locked(struct resource *res, struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = alloc_resource(GFP_KERNEL);
- struct resource *orig_parent = parent;
-
- if (!res)
- return NULL;
res->name = name;
res->start = start;
res->end = start + n - 1;
- write_lock(&resource_lock);
-
for (;;) {
struct resource *conflict;
@@ -1172,14 +1255,41 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- free_resource(res);
- res = NULL;
- break;
+ return -EBUSY;
}
+
+ return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name, int flags)
+{
+ struct resource *res = alloc_resource(GFP_KERNEL);
+ int ret;
+
+ if (!res)
+ return NULL;
+
+ write_lock(&resource_lock);
+ ret = __request_region_locked(res, parent, start, n, name, flags);
write_unlock(&resource_lock);
- if (res && orig_parent == &iomem_resource)
- revoke_devmem(res);
+ if (ret) {
+ free_resource(res);
+ return NULL;
+ }
+
+ if (parent == &iomem_resource)
+ revoke_iomem(res);
return res;
}
@@ -1228,16 +1338,13 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
/**
* release_mem_region_adjustable - release a previously reserved memory region
- * @parent: parent resource descriptor
* @start: resource start address
* @size: resource region size
*
@@ -1255,21 +1362,28 @@ EXPORT_SYMBOL(__release_region);
* assumes that all children remain in the lower address entry for
* simplicity. Enhance this logic when necessary.
*/
-int release_mem_region_adjustable(struct resource *parent,
- resource_size_t start, resource_size_t size)
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
+ struct resource *parent = &iomem_resource;
+ struct resource *new_res = NULL;
+ bool alloc_nofail = false;
struct resource **p;
struct resource *res;
- struct resource *new_res;
resource_size_t end;
- int ret = -EINVAL;
end = start + size - 1;
- if ((start < parent->start) || (end > parent->end))
- return ret;
+ if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
+ return;
- /* The alloc_resource() result gets checked later */
- new_res = alloc_resource(GFP_KERNEL);
+ /*
+ * We free up quite a lot of memory on memory hotunplug (esp., memap),
+ * just before releasing the region. This is highly unlikely to
+ * fail - let's play save and make it never fail as the caller cannot
+ * perform any error handling (e.g., trying to re-add memory will fail
+ * similarly).
+ */
+retry:
+ new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
p = &parent->child;
write_lock(&resource_lock);
@@ -1284,21 +1398,6 @@ int release_mem_region_adjustable(struct resource *parent,
continue;
}
- /*
- * All memory regions added from memory-hotplug path have the
- * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
- * this flag, we know that we are dealing with a resource coming
- * from HMM/devm. HMM/devm use another mechanism to add/release
- * a resource. This goes via devm_request_mem_region and
- * devm_release_mem_region.
- * HMM/devm take care to release their resources when they want,
- * so if we are dealing with them, let us just back off here.
- */
- if (!(res->flags & IORESOURCE_SYSRAM)) {
- ret = 0;
- break;
- }
-
if (!(res->flags & IORESOURCE_MEM))
break;
@@ -1312,20 +1411,23 @@ int release_mem_region_adjustable(struct resource *parent,
/* free the whole entry */
*p = res->sibling;
free_resource(res);
- ret = 0;
} else if (res->start == start && res->end != end) {
/* adjust the start */
- ret = __adjust_resource(res, end + 1,
- res->end - end);
+ WARN_ON_ONCE(__adjust_resource(res, end + 1,
+ res->end - end));
} else if (res->start != start && res->end == end) {
/* adjust the end */
- ret = __adjust_resource(res, res->start,
- start - res->start);
+ WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start));
} else {
- /* split into two entries */
+ /* split into two entries - we need a new resource */
if (!new_res) {
- ret = -ENOMEM;
- break;
+ new_res = alloc_resource(GFP_ATOMIC);
+ if (!new_res) {
+ alloc_nofail = true;
+ write_unlock(&resource_lock);
+ goto retry;
+ }
}
new_res->name = res->name;
new_res->start = end + 1;
@@ -1336,9 +1438,8 @@ int release_mem_region_adjustable(struct resource *parent,
new_res->sibling = res->sibling;
new_res->child = NULL;
- ret = __adjust_resource(res, res->start,
- start - res->start);
- if (ret)
+ if (WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start)))
break;
res->sibling = new_res;
new_res = NULL;
@@ -1349,10 +1450,69 @@ int release_mem_region_adjustable(struct resource *parent,
write_unlock(&resource_lock);
free_resource(new_res);
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool system_ram_resources_mergeable(struct resource *r1,
+ struct resource *r2)
+{
+ /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
+ return r1->flags == r2->flags && r1->end + 1 == r2->start &&
+ r1->name == r2->name && r1->desc == r2->desc &&
+ !r1->child && !r2->child;
+}
+
+/**
+ * merge_system_ram_resource - mark the System RAM resource mergeable and try to
+ * merge it with adjacent, mergeable resources
+ * @res: resource descriptor
+ *
+ * This interface is intended for memory hotplug, whereby lots of contiguous
+ * system ram resources are added (e.g., via add_memory*()) by a driver, and
+ * the actual resource boundaries are not of interest (e.g., it might be
+ * relevant for DIMMs). Only resources that are marked mergeable, that have the
+ * same parent, and that don't have any children are considered. All mergeable
+ * resources must be immutable during the request.
+ *
+ * Note:
+ * - The caller has to make sure that no pointers to resources that are
+ * marked mergeable are used anymore after this call - the resource might
+ * be freed and the pointer might be stale!
+ * - release_mem_region_adjustable() will split on demand on memory hotunplug
+ */
+void merge_system_ram_resource(struct resource *res)
+{
+ const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *cur;
+
+ if (WARN_ON_ONCE((res->flags & flags) != flags))
+ return;
+
+ write_lock(&resource_lock);
+ res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
+
+ /* Try to merge with next item in the list. */
+ cur = res->sibling;
+ if (cur && system_ram_resources_mergeable(res, cur)) {
+ res->end = cur->end;
+ res->sibling = cur->sibling;
+ free_resource(cur);
+ }
+
+ /* Try to merge with previous item in the list. */
+ cur = res->parent->child;
+ while (cur && cur->sibling != res)
+ cur = cur->sibling;
+ if (cur && system_ram_resources_mergeable(cur, res)) {
+ cur->end = res->end;
+ cur->sibling = res->sibling;
+ free_resource(res);
+ }
+ write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
/*
* Managed region resource
*/
@@ -1536,22 +1696,22 @@ __setup("reserve=", reserve_setup);
*/
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
- struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
+ struct resource *p;
int err = 0;
- loff_t l;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(&iomem_resource, p, false) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
@@ -1562,10 +1722,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
@@ -1581,37 +1739,46 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
- int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
-
- addr = addr & PAGE_MASK;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
+ struct resource *p;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(root, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
@@ -1624,6 +1791,12 @@ bool iomem_is_exclusive(u64 addr)
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
@@ -1648,31 +1821,139 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
-#ifdef CONFIG_DEVICE_PRIVATE
-static struct resource *__request_free_mem_region(struct device *dev,
- struct resource *base, unsigned long size, const char *name)
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING (1UL << 0)
+#define GFR_REQUEST_REGION (1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+ resource_size_t align, unsigned long flags)
{
- resource_size_t end, addr;
+ if (flags & GFR_DESCENDING) {
+ resource_size_t end;
+
+ end = min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+ return end - size + 1;
+ }
+
+ return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+ resource_size_t size, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr > size && addr >= base->start;
+ /*
+ * In the ascend case be careful that the last increment by
+ * @size did not wrap 0.
+ */
+ return addr > addr - size &&
+ addr <= min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+ unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr - size;
+ return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+ struct resource *res = _res;
+
+ if (res->parent)
+ remove_resource(res);
+ free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+ resource_size_t size, const unsigned long align,
+ const char *name, const unsigned long desc,
+ const unsigned long flags)
+{
+ resource_size_t addr;
struct resource *res;
+ struct region_devres *dr = NULL;
- size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
- end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
- addr = end - size + 1UL;
+ size = ALIGN(size, align);
- for (; addr > size && addr >= base->start; addr -= size) {
- if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
- REGION_DISJOINT)
- continue;
+ res = alloc_resource(GFP_KERNEL);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
- if (dev)
- res = devm_request_mem_region(dev, addr, size, name);
- else
- res = request_mem_region(addr, size, name);
- if (!res)
+ if (dev && (flags & GFR_REQUEST_REGION)) {
+ dr = devres_alloc(devm_region_release,
+ sizeof(struct region_devres), GFP_KERNEL);
+ if (!dr) {
+ free_resource(res);
+ return ERR_PTR(-ENOMEM);
+ }
+ } else if (dev) {
+ if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
return ERR_PTR(-ENOMEM);
- res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ }
+
+ write_lock(&resource_lock);
+ for (addr = gfr_start(base, size, align, flags);
+ gfr_continue(base, addr, align, flags);
+ addr = gfr_next(addr, align, flags)) {
+ if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
+ continue;
+
+ if (flags & GFR_REQUEST_REGION) {
+ if (__request_region_locked(res, &iomem_resource, addr,
+ size, name, 0))
+ break;
+
+ if (dev) {
+ dr->parent = &iomem_resource;
+ dr->start = addr;
+ dr->n = size;
+ devres_add(dev, dr);
+ }
+
+ res->desc = desc;
+ write_unlock(&resource_lock);
+
+
+ /*
+ * A driver is claiming this region so revoke any
+ * mappings.
+ */
+ revoke_iomem(res);
+ } else {
+ res->start = addr;
+ res->end = addr + size - 1;
+ res->name = name;
+ res->desc = desc;
+ res->flags = IORESOURCE_MEM;
+
+ /*
+ * Only succeed if the resource hosts an exclusive
+ * range after the insert
+ */
+ if (__insert_resource(base, res) || res->child)
+ break;
+
+ write_unlock(&resource_lock);
+ }
+
return res;
}
+ write_unlock(&resource_lock);
+
+ if (flags & GFR_REQUEST_REGION) {
+ free_resource(res);
+ devres_free(dr);
+ } else if (dev)
+ devm_release_action(dev, remove_free_mem_region, res);
return ERR_PTR(-ERANGE);
}
@@ -1691,18 +1972,48 @@ static struct resource *__request_free_mem_region(struct device *dev,
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size)
{
- return __request_free_mem_region(dev, base, size, dev_name(dev));
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+ dev_name(dev),
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
struct resource *request_free_mem_region(struct resource *base,
unsigned long size, const char *name)
{
- return __request_free_mem_region(NULL, base, size, name);
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(request_free_mem_region);
-#endif /* CONFIG_DEVICE_PRIVATE */
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+ unsigned long size, unsigned long align,
+ const char *name)
+{
+ /* Default of ascending direction and insert resource */
+ unsigned long flags = 0;
+
+ return get_free_mem_region(NULL, base, size, align, name,
+ IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
static int __init strict_iomem(char *str)
{
@@ -1713,4 +2024,48 @@ static int __init strict_iomem(char *str)
return 1;
}
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+ .name = "iomem",
+ .owner = THIS_MODULE,
+ .init_fs_context = iomem_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+ static struct vfsmount *iomem_vfs_mount;
+ static int iomem_fs_cnt;
+ struct inode *inode;
+ int rc;
+
+ rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+ if (rc < 0) {
+ pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+ return rc;
+ }
+
+ inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ if (IS_ERR(inode)) {
+ rc = PTR_ERR(inode);
+ pr_err("Cannot allocate inode for iomem: %d\n", rc);
+ simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+ return rc;
+ }
+
+ /*
+ * Publish iomem revocation inode initialized.
+ * Pairs with smp_load_acquire() in revoke_iomem().
+ */
+ smp_store_release(&iomem_inode, inode);
+
+ return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
__setup("iomem=", strict_iomem);
diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c
new file mode 100644
index 000000000000..58ab9f914602
--- /dev/null
+++ b/kernel/resource_kunit.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Test cases for API provided by resource.c and ioport.h
+ */
+
+#include <kunit/test.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#define R0_START 0x0000
+#define R0_END 0xffff
+#define R1_START 0x1234
+#define R1_END 0x2345
+#define R2_START 0x4567
+#define R2_END 0x5678
+#define R3_START 0x6789
+#define R3_END 0x789a
+#define R4_START 0x2000
+#define R4_END 0x7000
+
+static struct resource r0 = { .start = R0_START, .end = R0_END };
+static struct resource r1 = { .start = R1_START, .end = R1_END };
+static struct resource r2 = { .start = R2_START, .end = R2_END };
+static struct resource r3 = { .start = R3_START, .end = R3_END };
+static struct resource r4 = { .start = R4_START, .end = R4_END };
+
+struct result {
+ struct resource *r1;
+ struct resource *r2;
+ struct resource r;
+ bool ret;
+};
+
+static struct result results_for_union[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R1_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R4_START, .r.end = R3_END, .ret = true,
+ },
+};
+
+static struct result results_for_intersection[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R1_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R3_START, .r.end = R3_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R4_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R3_START, .r.end = R4_END, .ret = true,
+ },
+};
+
+static void resource_do_test(struct kunit *test, bool ret, struct resource *r,
+ bool exp_ret, struct resource *exp_r,
+ struct resource *r1, struct resource *r2)
+{
+ KUNIT_EXPECT_EQ_MSG(test, ret, exp_ret, "Resources %pR %pR", r1, r2);
+ KUNIT_EXPECT_EQ_MSG(test, r->start, exp_r->start, "Start elements are not equal");
+ KUNIT_EXPECT_EQ_MSG(test, r->end, exp_r->end, "End elements are not equal");
+}
+
+static void resource_do_union_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_union(struct kunit *test)
+{
+ struct result *r = results_for_union;
+ unsigned int i = 0;
+
+ do {
+ resource_do_union_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_union));
+}
+
+static void resource_do_intersection_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_intersection(struct kunit *test)
+{
+ struct result *r = results_for_intersection;
+ unsigned int i = 0;
+
+ do {
+ resource_do_intersection_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_intersection));
+}
+
+static struct kunit_case resource_test_cases[] = {
+ KUNIT_CASE(resource_test_union),
+ KUNIT_CASE(resource_test_intersection),
+ {}
+};
+
+static struct kunit_suite resource_test_suite = {
+ .name = "resource",
+ .test_cases = resource_test_cases,
+};
+kunit_test_suite(resource_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/kernel/rseq.c b/kernel/rseq.c
index a4f86a9d6937..9de6e35fe679 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,8 +18,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
-#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
+#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
/*
*
@@ -81,21 +85,39 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
+ struct rseq __user *rseq = t->rseq;
u32 cpu_id = raw_smp_processor_id();
-
- if (put_user(cpu_id, &t->rseq->cpu_id_start))
- return -EFAULT;
- if (put_user(cpu_id, &t->rseq->cpu_id))
- return -EFAULT;
+ u32 node_id = cpu_to_node(cpu_id);
+ u32 mm_cid = task_mm_cid(t);
+
+ WARN_ON_ONCE((int) mm_cid < 0);
+ if (!user_write_access_begin(rseq, t->rseq_len))
+ goto efault;
+ unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
+ unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
+ unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
+ user_write_access_end();
trace_rseq_update(t);
return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
+ mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -109,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
+ /*
+ * Reset mm_cid to its initial state (0).
+ */
+ if (put_user(mm_cid, &t->rseq->mm_cid))
+ return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -120,8 +157,13 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
- if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+#ifdef CONFIG_64BIT
+ if (get_user(ptr, &t->rseq->rseq_cs))
+ return -EFAULT;
+#else
+ if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
return -EFAULT;
+#endif
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
@@ -158,28 +200,35 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
return 0;
}
+static bool rseq_warn_flags(const char *str, u32 flags)
+{
+ u32 test_flags;
+
+ if (!flags)
+ return false;
+ test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
+ test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
+ if (test_flags)
+ pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
+ return true;
+}
+
static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
u32 flags, event_mask;
int ret;
+ if (rseq_warn_flags("rseq_cs", cs_flags))
+ return -EINVAL;
+
/* Get thread flags. */
ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
- /* Take critical section flags into account. */
- flags |= cs_flags;
-
- /*
- * Restart on signal can only be inhibited when restart on
- * preempt and restart on migrate are inhibited too. Otherwise,
- * a preempted signal handler could fail to restart the prior
- * execution context on sigreturn.
- */
- if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
- (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
- RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+ if (rseq_warn_flags("rseq", flags))
return -EINVAL;
/*
@@ -191,7 +240,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
t->rseq_event_mask = 0;
preempt_enable();
- return !!(event_mask & ~flags);
+ return !!event_mask;
}
static int clear_rseq_cs(struct task_struct *t)
@@ -204,9 +253,13 @@ static int clear_rseq_cs(struct task_struct *t)
*
* Set rseq_cs to NULL.
*/
- if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+#ifdef CONFIG_64BIT
+ return put_user(0UL, &t->rseq->rseq_cs);
+#else
+ if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
return -EFAULT;
return 0;
+#endif
}
/*
@@ -266,12 +319,18 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
- goto error;
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
- if (unlikely(rseq_update_cpu_id(t)))
+
+ /*
+ * regs is NULL if and only if the caller is in a syscall path. Skip
+ * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
+ * kill a misbehaving userspace on debug kernels.
+ */
+ if (regs) {
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -294,8 +353,7 @@ void rseq_syscall(struct pt_regs *regs)
if (!t->rseq)
return;
- if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
- rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
+ if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
force_sig(SIGSEGV);
}
@@ -315,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -336,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -345,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
- if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
new file mode 100644
index 000000000000..59032aaccd18
--- /dev/null
+++ b/kernel/scftorture.c
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Torture test for smp_call_function() and friends.
+//
+// Copyright (C) Facebook, 2020.
+//
+// Author: Paul E. McKenney <paulmck@kernel.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#define SCFTORT_STRING "scftorture"
+#define SCFTORT_FLAG SCFTORT_STRING ": "
+
+#define VERBOSE_SCFTORTOUT(s, x...) \
+ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0)
+
+#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)");
+torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs.");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable.");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s.");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug.");
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
+torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
+torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations.");
+torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
+torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
+torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations.");
+torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations.");
+torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations.");
+
+char *torture_type = "";
+
+#ifdef MODULE
+# define SCFTORT_SHUTDOWN 0
+#else
+# define SCFTORT_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test.");
+
+struct scf_statistics {
+ struct task_struct *task;
+ int cpu;
+ long long n_resched;
+ long long n_single;
+ long long n_single_ofl;
+ long long n_single_rpc;
+ long long n_single_rpc_ofl;
+ long long n_single_wait;
+ long long n_single_wait_ofl;
+ long long n_many;
+ long long n_many_wait;
+ long long n_all;
+ long long n_all_wait;
+};
+
+static struct scf_statistics *scf_stats_p;
+static struct task_struct *scf_torture_stats_task;
+static DEFINE_PER_CPU(long long, scf_invoked_count);
+
+// Data for random primitive selection
+#define SCF_PRIM_RESCHED 0
+#define SCF_PRIM_SINGLE 1
+#define SCF_PRIM_SINGLE_RPC 2
+#define SCF_PRIM_MANY 3
+#define SCF_PRIM_ALL 4
+#define SCF_NPRIMS 8 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED and
+ // SCF_PRIM_SINGLE_RPC.
+
+static char *scf_prim_name[] = {
+ "resched_cpu",
+ "smp_call_function_single",
+ "smp_call_function_single_rpc",
+ "smp_call_function_many",
+ "smp_call_function",
+};
+
+struct scf_selector {
+ unsigned long scfs_weight;
+ int scfs_prim;
+ bool scfs_wait;
+};
+static struct scf_selector scf_sel_array[SCF_NPRIMS];
+static int scf_sel_array_len;
+static unsigned long scf_sel_totweight;
+
+// Communicate between caller and handler.
+struct scf_check {
+ bool scfc_in;
+ bool scfc_out;
+ int scfc_cpu; // -1 for not _single().
+ bool scfc_wait;
+ bool scfc_rpc;
+ struct completion scfc_completion;
+};
+
+// Use to wait for all threads to start.
+static atomic_t n_started;
+static atomic_t n_errs;
+static atomic_t n_mb_in_errs;
+static atomic_t n_mb_out_errs;
+static atomic_t n_alloc_errs;
+static bool scfdone;
+static char *bangstr = "";
+
+static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
+
+extern void resched_cpu(int cpu); // An alternative IPI vector.
+
+// Print torture statistics. Caller must ensure serialization.
+static void scf_torture_stats_print(void)
+{
+ int cpu;
+ int i;
+ long long invoked_count = 0;
+ bool isdone = READ_ONCE(scfdone);
+ struct scf_statistics scfs = {};
+
+ for_each_possible_cpu(cpu)
+ invoked_count += data_race(per_cpu(scf_invoked_count, cpu));
+ for (i = 0; i < nthreads; i++) {
+ scfs.n_resched += scf_stats_p[i].n_resched;
+ scfs.n_single += scf_stats_p[i].n_single;
+ scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
+ scfs.n_single_rpc += scf_stats_p[i].n_single_rpc;
+ scfs.n_single_wait += scf_stats_p[i].n_single_wait;
+ scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl;
+ scfs.n_many += scf_stats_p[i].n_many;
+ scfs.n_many_wait += scf_stats_p[i].n_many_wait;
+ scfs.n_all += scf_stats_p[i].n_all;
+ scfs.n_all_wait += scf_stats_p[i].n_all_wait;
+ }
+ if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
+ atomic_read(&n_mb_out_errs) ||
+ (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs)))
+ bangstr = "!!! ";
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
+ SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
+ scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
+ scfs.n_single_rpc, scfs.n_single_rpc_ofl,
+ scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
+ torture_onoff_stats();
+ pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs),
+ atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs),
+ atomic_read(&n_alloc_errs));
+}
+
+// Periodically prints torture statistics, if periodic statistics printing
+// was specified via the stat_interval module parameter.
+static int
+scf_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("scf_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ scf_torture_stats_print();
+ torture_shutdown_absorb("scf_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("scf_torture_stats");
+ return 0;
+}
+
+// Add a primitive to the scf_sel_array[].
+static void scf_sel_add(unsigned long weight, int prim, bool wait)
+{
+ struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len];
+
+ // If no weight, if array would overflow, if computing three-place
+ // percentages would overflow, or if the scf_prim_name[] array would
+ // overflow, don't bother. In the last three two cases, complain.
+ if (!weight ||
+ WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) ||
+ WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) ||
+ WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name)))
+ return;
+ scf_sel_totweight += weight;
+ scfsp->scfs_weight = scf_sel_totweight;
+ scfsp->scfs_prim = prim;
+ scfsp->scfs_wait = wait;
+ scf_sel_array_len++;
+}
+
+// Dump out weighting percentages for scf_prim_name[] array.
+static void scf_sel_dump(void)
+{
+ int i;
+ unsigned long oldw = 0;
+ struct scf_selector *scfsp;
+ unsigned long w;
+
+ for (i = 0; i < scf_sel_array_len; i++) {
+ scfsp = &scf_sel_array[i];
+ w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight;
+ pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000,
+ scf_prim_name[scfsp->scfs_prim],
+ scfsp->scfs_wait ? "wait" : "nowait");
+ oldw = scfsp->scfs_weight;
+ }
+}
+
+// Randomly pick a primitive and wait/nowait, based on weightings.
+static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
+{
+ int i;
+ unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1);
+
+ for (i = 0; i < scf_sel_array_len; i++)
+ if (scf_sel_array[i].scfs_weight >= w)
+ return &scf_sel_array[i];
+ WARN_ON_ONCE(1);
+ return &scf_sel_array[0];
+}
+
+// Update statistics and occasionally burn up mass quantities of CPU time,
+// if told to do so via scftorture.longwait. Otherwise, occasionally burn
+// a little bit.
+static void scf_handler(void *scfc_in)
+{
+ int i;
+ int j;
+ unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand));
+ struct scf_check *scfcp = scfc_in;
+
+ if (likely(scfcp)) {
+ WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers.
+ if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in))))
+ atomic_inc(&n_mb_in_errs);
+ }
+ this_cpu_inc(scf_invoked_count);
+ if (longwait <= 0) {
+ if (!(r & 0xffc0)) {
+ udelay(r & 0x3f);
+ goto out;
+ }
+ }
+ if (r & 0xfff)
+ goto out;
+ r = (r >> 12);
+ if (longwait <= 0) {
+ udelay((r & 0xff) + 1);
+ goto out;
+ }
+ r = r % longwait + 1;
+ for (i = 0; i < r; i++) {
+ for (j = 0; j < 1000; j++) {
+ udelay(1000);
+ cpu_relax();
+ }
+ }
+out:
+ if (unlikely(!scfcp))
+ return;
+ if (scfcp->scfc_wait) {
+ WRITE_ONCE(scfcp->scfc_out, true);
+ if (scfcp->scfc_rpc)
+ complete(&scfcp->scfc_completion);
+ } else {
+ kfree(scfcp);
+ }
+}
+
+// As above, but check for correct CPU.
+static void scf_handler_1(void *scfc_in)
+{
+ struct scf_check *scfcp = scfc_in;
+
+ if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) {
+ atomic_inc(&n_errs);
+ }
+ scf_handler(scfcp);
+}
+
+// Randomly do an smp_call_function*() invocation.
+static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp)
+{
+ bool allocfail = false;
+ uintptr_t cpu;
+ int ret = 0;
+ struct scf_check *scfcp = NULL;
+ struct scf_selector *scfsp = scf_sel_rand(trsp);
+
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
+ scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
+ if (!scfcp) {
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN));
+ atomic_inc(&n_alloc_errs);
+ allocfail = true;
+ } else {
+ scfcp->scfc_cpu = -1;
+ scfcp->scfc_wait = scfsp->scfs_wait;
+ scfcp->scfc_out = false;
+ scfcp->scfc_rpc = false;
+ }
+ }
+ switch (scfsp->scfs_prim) {
+ case SCF_PRIM_RESCHED:
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_resched++;
+ resched_cpu(cpu);
+ this_cpu_inc(scf_invoked_count);
+ }
+ break;
+ case SCF_PRIM_SINGLE:
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ if (scfsp->scfs_wait)
+ scfp->n_single_wait++;
+ else
+ scfp->n_single++;
+ if (scfcp) {
+ scfcp->scfc_cpu = cpu;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait);
+ if (ret) {
+ if (scfsp->scfs_wait)
+ scfp->n_single_wait_ofl++;
+ else
+ scfp->n_single_ofl++;
+ kfree(scfcp);
+ scfcp = NULL;
+ }
+ break;
+ case SCF_PRIM_SINGLE_RPC:
+ if (!scfcp)
+ break;
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_single_rpc++;
+ scfcp->scfc_cpu = cpu;
+ scfcp->scfc_wait = true;
+ init_completion(&scfcp->scfc_completion);
+ scfcp->scfc_rpc = true;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0);
+ if (!ret) {
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ wait_for_completion(&scfcp->scfc_completion);
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ } else {
+ scfp->n_single_rpc_ofl++;
+ kfree(scfcp);
+ scfcp = NULL;
+ }
+ break;
+ case SCF_PRIM_MANY:
+ if (scfsp->scfs_wait)
+ scfp->n_many_wait++;
+ else
+ scfp->n_many++;
+ if (scfcp) {
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait);
+ break;
+ case SCF_PRIM_ALL:
+ if (scfsp->scfs_wait)
+ scfp->n_all_wait++;
+ else
+ scfp->n_all++;
+ if (scfcp) {
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ smp_call_function(scf_handler, scfcp, scfsp->scfs_wait);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ if (scfcp)
+ scfcp->scfc_out = true;
+ }
+ if (scfcp && scfsp->scfs_wait) {
+ if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
+ !scfcp->scfc_out)) {
+ pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
+ atomic_inc(&n_mb_out_errs); // Leak rather than trash!
+ } else {
+ kfree(scfcp);
+ }
+ barrier(); // Prevent race-reduction compiler optimizations.
+ }
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ if (allocfail)
+ schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete.
+ else if (!(torture_random(trsp) & 0xfff))
+ schedule_timeout_uninterruptible(1);
+}
+
+// SCF test kthread. Repeatedly does calls to members of the
+// smp_call_function() family of functions.
+static int scftorture_invoker(void *arg)
+{
+ int cpu;
+ int curcpu;
+ DEFINE_TORTURE_RANDOM(rand);
+ struct scf_statistics *scfp = (struct scf_statistics *)arg;
+ bool was_offline = false;
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
+ cpu = scfp->cpu % nr_cpu_ids;
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu)));
+ set_user_nice(current, MAX_NICE);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id());
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ curcpu = raw_smp_processor_id();
+ WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
+ "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
+ __func__, scfp->cpu, curcpu, nr_cpu_ids);
+
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started)) {
+ if (torture_must_stop()) {
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu);
+ goto end;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
+
+ do {
+ scftorture_invoke_one(scfp, &rand);
+ while (cpu_is_offline(cpu) && !torture_must_stop()) {
+ schedule_timeout_interruptible(HZ / 5);
+ was_offline = true;
+ }
+ if (was_offline) {
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ was_offline = false;
+ }
+ cond_resched();
+ stutter_wait("scftorture_invoker");
+ } while (!torture_must_stop());
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu);
+end:
+ torture_kthread_stopping("scftorture_invoker");
+ return 0;
+}
+
+static void
+scftorture_print_module_parms(const char *tag)
+{
+ pr_alert(SCFTORT_FLAG
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+}
+
+static void scf_cleanup_handler(void *unused)
+{
+}
+
+static void scf_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ WRITE_ONCE(scfdone, true);
+ if (nthreads && scf_stats_p)
+ for (i = 0; i < nthreads; i++)
+ torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
+ else
+ goto end;
+ smp_call_function(scf_cleanup_handler, NULL, 0);
+ torture_stop_kthread(scf_torture_stats, scf_torture_stats_task);
+ scf_torture_stats_print(); // -After- the stats thread is stopped!
+ kfree(scf_stats_p); // -After- the last stats print has completed!
+ scf_stats_p = NULL;
+
+ if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs))
+ scftorture_print_module_parms("End of test: FAILURE");
+ else if (torture_onoff_failures())
+ scftorture_print_module_parms("End of test: LOCK_HOTPLUG");
+ else
+ scftorture_print_module_parms("End of test: SUCCESS");
+
+end:
+ torture_cleanup_end();
+}
+
+static int __init scf_torture_init(void)
+{
+ long i;
+ int firsterr = 0;
+ unsigned long weight_resched1 = weight_resched;
+ unsigned long weight_single1 = weight_single;
+ unsigned long weight_single_rpc1 = weight_single_rpc;
+ unsigned long weight_single_wait1 = weight_single_wait;
+ unsigned long weight_many1 = weight_many;
+ unsigned long weight_many_wait1 = weight_many_wait;
+ unsigned long weight_all1 = weight_all;
+ unsigned long weight_all_wait1 = weight_all_wait;
+
+ if (!torture_init_begin(SCFTORT_STRING, verbose))
+ return -EBUSY;
+
+ scftorture_print_module_parms("Start of test");
+
+ if (weight_resched <= 0 &&
+ weight_single <= 0 && weight_single_rpc <= 0 && weight_single_wait <= 0 &&
+ weight_many <= 0 && weight_many_wait <= 0 &&
+ weight_all <= 0 && weight_all_wait <= 0) {
+ weight_resched1 = weight_resched == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single1 = weight_single == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_rpc1 = weight_single_rpc == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_wait1 = weight_single_wait == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_many1 = weight_many == 0 ? 0 : 2;
+ weight_many_wait1 = weight_many_wait == 0 ? 0 : 2;
+ weight_all1 = weight_all == 0 ? 0 : 1;
+ weight_all_wait1 = weight_all_wait == 0 ? 0 : 1;
+ } else {
+ if (weight_resched == -1)
+ weight_resched1 = 0;
+ if (weight_single == -1)
+ weight_single1 = 0;
+ if (weight_single_rpc == -1)
+ weight_single_rpc1 = 0;
+ if (weight_single_wait == -1)
+ weight_single_wait1 = 0;
+ if (weight_many == -1)
+ weight_many1 = 0;
+ if (weight_many_wait == -1)
+ weight_many_wait1 = 0;
+ if (weight_all == -1)
+ weight_all1 = 0;
+ if (weight_all_wait == -1)
+ weight_all_wait1 = 0;
+ }
+ if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 &&
+ weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 &&
+ weight_all1 == 0 && weight_all_wait1 == 0) {
+ SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
+ scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
+ else if (weight_resched1)
+ SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
+ scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+ scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
+ scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
+ scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
+ scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
+ scf_sel_add(weight_all1, SCF_PRIM_ALL, false);
+ scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true);
+ scf_sel_dump();
+
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter, stutter);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ // Worker tasks invoking smp_call_function().
+ if (nthreads < 0)
+ nthreads = num_online_cpus();
+ scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
+ if (!scf_stats_p) {
+ SCFTORTOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads);
+
+ atomic_set(&n_started, nthreads);
+ for (i = 0; i < nthreads; i++) {
+ scf_stats_p[i].cpu = i;
+ firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i],
+ scf_stats_p[i].task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ scf_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_SCF_TORTURE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(scf_torture_init);
+module_exit(scf_torture_cleanup);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,17 +1,17 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
+
+# The compilers are complaining about unused variables inside an if(0) scope
+# block. This is daft, shut them up.
+ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
# These files are disabled because they produce non-interesting flaky coverage
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
-# There are numerous data races here, however, most of them are due to plain accesses.
-# This would make it even harder for syzbot to find reproducers, because these
-# bugs trigger without specific input. Disable by default, but should re-enable
-# eventually.
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -22,17 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..991fc9002535 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,20 +1,42 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Auto-group scheduling implementation:
*/
-#include <linux/nospec.h>
-#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
+ sched_autogroup_sysctl_init();
}
void autogroup_free(struct task_group *tg)
@@ -31,7 +53,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -139,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
struct task_struct *t;
unsigned long flags;
- BUG_ON(!lock_task_sighand(p, &flags));
+ if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+ return;
prev = p->signal->autogroup;
if (prev == ag) {
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
+ extern unsigned int sysctl_sched_autogroup_enabled;
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
@@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
}
#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..d9dc9ab3773f
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/livepatch.h>
+#include <linux/psi.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "smp.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..80a3df49ab47
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c, smp.c and policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 12bca64dff73..3c6193de9cde 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -41,7 +41,7 @@
* Otherwise it tries to create a semi stable clock from a mixture of other
* clocks, including:
*
- * - GTOD (clock monotomic)
+ * - GTOD (clock monotonic)
* - sched_clock()
* - explicit idle events
*
@@ -53,15 +53,13 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include "sched.h"
-#include <linux/sched_clock.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -95,28 +93,28 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
return static_branch_likely(&__sched_clock_stable);
}
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
{
scd->tick_gtod = ktime_get_ns();
scd->tick_raw = sched_clock();
}
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void)
* The only way to fully avoid random clock jumps is to boot with:
* "tsc=unstable".
*/
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
{
struct sched_clock_data *scd;
int cpu;
@@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
{
if (!sched_clock_stable())
return;
@@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void)
schedule_work(&sched_clock_work);
}
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
@@ -196,7 +194,7 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
{
struct sched_clock_data *scd = this_scd();
@@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -262,13 +260,13 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -289,13 +287,38 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock_noinstr(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock_noinstr() + __sched_clock_offset;
+
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock_noinstr();
+
+ clock = sched_clock_local(this_scd());
+
+ return clock;
+}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -351,7 +374,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
@@ -362,7 +385,7 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
@@ -386,7 +409,7 @@ u64 sched_clock_cpu(int cpu)
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
@@ -403,7 +426,7 @@ void sched_clock_tick(void)
sched_clock_local(scd);
}
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
{
if (!sched_clock_stable())
return;
@@ -423,7 +446,7 @@ void sched_clock_tick_stable(void)
/*
* We are going deep-idle (irqs are disabled):
*/
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
@@ -432,7 +455,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
unsigned long flags;
@@ -458,7 +481,7 @@ void __init sched_clock_init(void)
local_irq_enable();
}
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
if (!static_branch_likely(&sched_clock_running))
return 0;
@@ -476,7 +499,7 @@ u64 sched_clock_cpu(int cpu)
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..3561ab533dd4 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,7 +12,23 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include "sched.h"
+
+static void complete_with_flags(struct completion *x, int wake_flags)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked(&x->wait, wake_flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
+void complete_on_current_cpu(struct completion *x)
+{
+ return complete_with_flags(x, WF_CURRENT_CPU);
+}
/**
* complete: - signals a single thread waiting on this completion
@@ -27,14 +44,7 @@
*/
void complete(struct completion *x)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&x->wait.lock, flags);
-
- if (x->done != UINT_MAX)
- x->done++;
- swake_up_locked(&x->wait);
- raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+ complete_with_flags(x, 0);
}
EXPORT_SYMBOL(complete);
@@ -204,6 +214,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
@@ -241,12 +252,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2142c6767682..9116bcc90346 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,25 +6,98 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
-#include "sched.h"
-
-#include <linux/nospec.h>
-
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
+
+#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
+#include <linux/sched/wake_q.h>
#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+#include <uapi/linux/sched/types.h>
+
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
+#undef CREATE_TRACE_POINTS
+#include "sched.h"
+#include "stats.h"
+
+#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
+#include "stats.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
+#include "../smpboot.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -35,11 +108,17 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -53,27 +132,495 @@ const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
+#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+
+__read_mostly int scheduler_running;
+
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(const struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
+
+ if (rt_prio(p->prio)) /* includes deadline */
+ return p->prio; /* [-1, 99] */
+
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
*/
-unsigned int sysctl_sched_rt_period = 1000000;
-__read_mostly int scheduler_running;
+/* real prio, less is less */
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
+{
+
+ int pa = __task_prio(a), pb = __task_prio(b);
+
+ if (-pa < -pb)
+ return true;
+
+ if (-pb < -pa)
+ return false;
+
+ if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+ return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b, in_fi);
+
+ return false;
+}
+
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
+{
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
+ return true;
+
+ return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+ return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+ const struct task_struct *p = __node_2_sc(node);
+ unsigned long cookie = (unsigned long)key;
+
+ if (cookie < p->core_cookie)
+ return -1;
+
+ if (cookie > p->core_cookie)
+ return 1;
+
+ return 0;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
+{
+ rq->core->core_task_seq++;
+
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
+
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
+}
+
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
+{
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
+
+ return 0;
+}
+
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
+
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
*/
-int sysctl_sched_rt_runtime = 950000;
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
+
+ return sched_core_next(p, cookie);
+}
+
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
+static void __sched_core_flip(bool enabled)
+{
+ unsigned long flags;
+ int cpu, t;
+
+ cpus_read_lock();
+
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ sched_core_lock(cpu, &flags);
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
+ sched_core_unlock(cpu, &flags);
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+ }
+
+ /*
+ * Toggle the offline CPUs.
+ */
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
+}
+
+static void sched_core_assert_empty(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+ sched_core_assert_empty();
+}
+
+static void __sched_core_disable(void)
+{
+ sched_core_assert_empty();
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ if (atomic_inc_not_zero(&sched_core_count))
+ return;
+
+ mutex_lock(&sched_core_mutex);
+ if (!atomic_read(&sched_core_count))
+ __sched_core_enable();
+
+ smp_mb__before_atomic();
+ atomic_inc(&sched_core_count);
+ mutex_unlock(&sched_core_mutex);
+}
+
+static void __sched_core_put(struct work_struct *work)
+{
+ if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+ }
+}
+
+void sched_core_put(void)
+{
+ static DECLARE_WORK(_work, __sched_core_put);
+
+ /*
+ * "There can be only one"
+ *
+ * Either this is the last one, or we don't actually need to do any
+ * 'work'. If it is the last *again*, we rely on
+ * WORK_STRUCT_PENDING_BIT.
+ */
+ if (!atomic_add_unless(&sched_core_count, -1, 1))
+ schedule_work(&_work);
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */
+
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+ raw_spinlock_t *lock;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == __rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+ raw_spinlock_t *lock;
+ bool ret;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == __rq_lockp(rq)))) {
+ preempt_enable();
+ return ret;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+void raw_spin_rq_unlock(struct rq *rq)
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+
+ double_rq_clock_clear_update(rq1, rq2);
+}
+#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -87,12 +634,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -111,7 +658,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
/*
* move_queued_task() task_rq_lock()
*
@@ -133,7 +680,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
@@ -176,6 +723,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -203,7 +752,7 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -221,14 +770,6 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
- csd->flags = 0;
- csd->func = func;
- csd->info = rq;
-}
-
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -264,8 +805,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -289,7 +831,6 @@ static void __hrtick_start(void *arg)
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -297,9 +838,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -329,7 +868,7 @@ void hrtick_start(struct rq *rq, u64 delay)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
@@ -351,15 +890,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -368,7 +903,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -383,30 +918,27 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
- break;
- val = old;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -419,7 +951,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
/*
* Atomically grab the task, if ->wake_q is !nil already it means
- * its already queued (either by us or someone else) and will get the
+ * it's already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* In order to ensure that a pending wakeup will observe our pending
@@ -486,7 +1018,6 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- BUG_ON(!task);
/* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -512,7 +1043,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr))
return;
@@ -536,10 +1067,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
@@ -556,33 +1087,32 @@ int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- rcu_read_lock();
+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+
+ return default_cpu;
}
/*
@@ -602,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefits.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
@@ -648,7 +1200,7 @@ static void nohz_csd_func(void *info)
/*
* Release the rq::nohz_csd.
*/
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
@@ -661,6 +1213,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -670,7 +1236,7 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * If there are more than one RR tasks, we need the tick to effect the
+ * If there are more than one RR tasks, we need the tick to affect the
* actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
@@ -696,6 +1262,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the cpu now we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -786,14 +1364,51 @@ static void set_load_weight(struct task_struct *p, bool update_load)
static DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have an actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -802,12 +1417,7 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT];
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
- return clamp_value / UCLAMP_BUCKET_DELTA;
-}
-
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -849,7 +1459,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -873,12 +1483,40 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ guard(task_rq_lock)(p);
+ __uclamp_update_util_min_rt_default(p);
+}
+
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
+ /* Copy by value as we could modify it */
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;
/*
* Tasks in autogroups or root task group will be
@@ -889,9 +1527,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
if (task_group(p) == &root_task_group)
return uc_req;
- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
@@ -948,7 +1588,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -966,8 +1606,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -988,12 +1628,40 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int bkt_clamp;
unsigned int rq_clamp;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * Need to be careful of the following enqueue/dequeue ordering
+ * problem too
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bucket->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
+
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
+
uc_se->active = false;
/*
@@ -1005,7 +1673,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
@@ -1013,7 +1681,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
@@ -1021,6 +1689,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1036,6 +1713,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1043,9 +1729,27 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+uclamp_update_active(struct task_struct *p)
{
+ enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
@@ -1065,34 +1769,31 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
- unsigned int clamps)
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
- enum uclamp_id clamp_id;
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- for_each_clamp_id(clamp_id) {
- if ((0x1 << clamp_id) & clamps)
- uclamp_update_active(p, clamp_id);
- }
- }
+ while ((p = css_task_iter_next(&it)))
+ uclamp_update_active(p);
css_task_iter_end(&it);
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
@@ -1102,33 +1803,62 @@ static void uclamp_update_root_tg(void)
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ guard(rcu)();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+}
+
+static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
- int old_min, old_max;
+ int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
result = -EINVAL;
goto undo;
}
@@ -1144,78 +1874,128 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
update_root_tg = true;
}
- if (update_root_tg)
+ if (update_root_tg) {
+ static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
+ }
+
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ static_branch_enable(&sched_uclamp_used);
+ uclamp_sync_util_min_rt_default();
+ }
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
-done:
- mutex_unlock(&uclamp_mutex);
-
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
+#endif
+#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
- lower_bound = attr->sched_util_min;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
- upper_bound = attr->sched_util_max;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
- if (lower_bound > upper_bound)
- return -EINVAL;
- if (upper_bound > SCHED_CAPACITY_SCALE)
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
return 0;
}
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
- /*
- * On scheduling class change, reset to default clamps for tasks
- * without a task-specific value.
- */
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int clamp_value = uclamp_none(clamp_id);
+ unsigned int value;
- /* Keep using defined clamps across class changes */
- if (uc_se->user_defined)
+ if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
- /* By default, RT tasks always get 100% boost */
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
- uclamp_se_set(uc_se, clamp_value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
@@ -1225,6 +2005,10 @@ static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still at its early fork stages.
+ */
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1237,19 +2021,33 @@ static void uclamp_fork(struct task_struct *p)
}
}
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
+}
+
+static void __init init_uclamp_rq(struct rq *rq)
+{
+ enum uclamp_id clamp_id;
+ struct uclamp_rq *uc_rq = rq->uclamp;
+
+ for_each_clamp_id(clamp_id) {
+ uc_rq[clamp_id] = (struct uclamp_rq) {
+ .value = uclamp_none(clamp_id)
+ };
+ }
+
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
+}
+
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
- mutex_init(&uclamp_mutex);
-
- for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0,
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
- cpu_rq(cpu)->uclamp_flags = 0;
- }
+ for_each_possible_cpu(cpu)
+ init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1278,30 +2076,61 @@ static inline int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
+bool sched_task_on_rq(struct task_struct *p)
+{
+ return task_on_rq_queued(p);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_queued(rq, p);
- psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ sched_info_enqueue(rq, p);
+ psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p, flags);
+
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
- sched_info_dequeued(rq, p);
+ sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
@@ -1311,24 +2140,37 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
+
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1340,15 +2182,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -1402,22 +2236,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
- const struct sched_class *class;
-
- if (p->sched_class == rq->curr->sched_class) {
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ if (p->sched_class == rq->curr->sched_class)
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
+ resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
@@ -1427,21 +2251,253 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
+ */
+ guard(raw_spinlock_irq)(&p->pi_lock);
+ return __task_state_match(p, state);
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
+static void
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ /*
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
+ */
+ __do_set_cpus_allowed(p, &ac);
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ if (WARN_ON_ONCE(!p->migration_disabled))
+ return;
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &ac);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
+ /* When not in the task's cpumask, no point in looking further. */
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p))
+ /* migrate_disabled() must be allowed to finish. */
+ if (is_migration_disabled(p))
+ return cpu_online(cpu);
+
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
return cpu_online(cpu);
- return cpu_active(cpu);
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_dying(cpu))
+ return false;
+
+ /* But are allowed during online. */
+ return cpu_online(cpu);
}
/*
@@ -1466,27 +2522,38 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
- enqueue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
+ activate_task(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
+struct set_affinity_pending {
+ refcount_t refs;
+ unsigned int stop_pending;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1505,7 +2572,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@@ -1519,39 +2585,149 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
static int migration_cpu_stop(void *data)
{
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ /*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
+
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
- if (task_on_rq_queued(p))
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
+ goto out;
+ }
+
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
+
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ preempt_enable();
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ if (pending)
+ pending->stop_pending = false;
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
- local_irq_enable();
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_rq_lock(rq);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_rq_unlock(rq);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
return 0;
}
@@ -1559,18 +2735,45 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
- cpumask_copy(&p->cpus_mask, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
+ return;
+ }
+
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if SCA_USER flag set
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * This here violates the locking rules for affinity, since we're only
+ * supposed to change these variables while holding both rq->lock and
+ * p->pi_lock.
+ *
+ * HOWEVER, it magically works, because ttwu() is the only code that
+ * accesses these variables under p->pi_lock and only does so after
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+ * before finish_task().
+ *
+ * XXX do further audits, this smells like something putrid.
+ */
+ if (ctx->flags & SCA_MIGRATE_DISABLE)
+ SCHED_WARN_ON(!p->on_cpu);
+ else
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -1580,13 +2783,13 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask);
+ p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1595,113 +2798,572 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
+
+ __do_set_cpus_allowed(p, &ac);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+static cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
{
+ cpumask_t *user_mask;
+ unsigned long flags;
+
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
+ return 0;
+
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
+ return -ENOMEM;
+
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <resumes>
+ * migrate_enable();
+ * __set_cpus_allowed_ptr();
+ * <wakes local stopper>
+ * `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
+ * set_cpus_allowed_ptr(P0, [0, 1]);
+ * <signal completion>
+ * <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ bool stop_pending, complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
+ pending = p->migration_pending;
+ if (pending && !pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+ preempt_enable();
+
+ if (complete)
+ complete_all(&pending->done);
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ /* Install the request */
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ .pending = &my_pending,
+ };
+
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ /*
+ * Affinity has changed, but we've already installed a
+ * pending. migration_cpu_stop() *must* see this, else
+ * we risk a completion of the pending despite having a
+ * task on a disallowed CPU.
+ *
+ * Serialized by p->pi_lock, so this is safe.
+ */
+ pending->arg.dest_cpu = dest_cpu;
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ /*
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
+ */
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+ preempt_enable();
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs); /* No UaF, just an address */
+
+ /*
+ * Block the original owner of &pending until all subsequent callers
+ * have seen the completion and decremented the refcount
+ */
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
+ return 0;
+}
+
+/*
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
+ */
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ struct affinity_context *ctx,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD) {
+ if (kthread || is_migration_disabled(p)) {
/*
- * Kernel threads are allowed on online && !active CPUs
+ * Kernel threads are allowed on online && !active CPUs,
+ * however, during cpu-hot-unplug, even these might get pushed
+ * away if not KTHREAD_IS_PER_CPU.
+ *
+ * Specifically, migration_disabled() tasks must not fail the
+ * cpumask_any_and_distribute() pick below, esp. so on
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (cpumask_equal(&p->cpus_mask, new_mask))
- goto out;
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
+ goto out;
+ }
+
+ if (WARN_ON_ONCE(p == current &&
+ is_migration_disabled(p) &&
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
/*
* Picking a ~random cpu helps in cases where we are changing affinity
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
- dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
- do_set_cpus_allowed(p, new_mask);
+ __do_set_cpus_allowed(p, ctx);
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-CPU threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- p->nr_cpus_allowed != 1);
- }
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, false);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr().
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
+
+ /*
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
+ */
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
+ unsigned int state = READ_ONCE(p->__state);
+
/*
* We should never call set_task_cpu() on a blocked task,
* ttwu() will sort out the placement.
*/
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !p->on_rq);
+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
/*
* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
* because schedstat_wait_{start,end} rebase migrating task's wait_start
* time relying on p->on_rq.
*/
- WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ WARN_ON_ONCE(state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
@@ -1717,12 +3379,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
+
+ WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -1732,6 +3396,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -1754,7 +3419,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
@@ -1778,7 +3443,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -1786,33 +3450,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -1855,114 +3511,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_running(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (!match_state || p->state == match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -1978,13 +3526,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
@@ -2027,9 +3573,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -2046,17 +3590,21 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
- /* Fall-through */
+ fallthrough;
case possible:
- do_set_cpus_allowed(p, cpu_possible_mask);
+ /*
+ * XXX When called from select_task_rq() we only
+ * hold p->pi_lock and again violate locking order.
+ *
+ * More yuck to audit.
+ */
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -2083,12 +3631,12 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+ cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
@@ -2110,6 +3658,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
+ static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
@@ -2125,6 +3674,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
+
+ /*
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+ * adjust the effective priority of a task. As a result,
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
+ * which can then trigger wakeups of the stop thread to push
+ * around the current task.
+ *
+ * The stop task itself will never be part of the PI-chain, it
+ * never blocks, therefore that ->pi_lock recursion is safe.
+ * Tell lockdep about this by placing the stop->pi_lock in its
+ * own class.
+ */
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
@@ -2138,15 +3701,27 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else
+#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ struct affinity_context *ctx)
{
- return set_cpus_allowed_ptr(p, new_mask);
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
}
-#endif /* CONFIG_SMP */
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -2161,46 +3736,71 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
- __schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
- __schedstat_inc(p->se.statistics.nr_wakeups_remote);
- rcu_read_lock();
+ __schedstat_inc(p->stats.nr_wakeups_remote);
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
- __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
- __schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
- __schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ wakeup_preempt(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Our task @p is fully woken up and running; so its safe to
+ * Our task @p is fully woken up and running; so it's safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
rq_unpin_lock(rq, rf);
@@ -2220,35 +3820,36 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
#endif
-}
-
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_held(&rq->lock);
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
-#endif
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
+ p->dl_server = NULL;
}
/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
*/
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
@@ -2256,9 +3857,15 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ wakeup_preempt(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -2277,13 +3884,6 @@ void sched_ttwu_pending(void *arg)
if (!llist)
return;
- /*
- * rq::ttwu_pending racy indication of out-standing wakeups.
- * Races such that false-negatives are possible, since they
- * are shorter lived that false-positives would be.
- */
- WRITE_ONCE(rq->ttwu_pending, 0);
-
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -2297,17 +3897,34 @@ void sched_ttwu_pending(void *arg)
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
+ /*
+ * Must be after enqueueing at least once task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
+ *
+ * It is ok to clear ttwu_pending when another task pending.
+ * We will receive IPI after local irq enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get correct result.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
-void send_call_function_single_ipi(int cpu)
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (!set_nr_if_polling(rq->idle))
- arch_send_call_function_single_ipi(cpu);
- else
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
+ return false;
+ }
+
+ return true;
}
/*
@@ -2329,48 +3946,70 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
- if (set_nr_if_polling(rq->idle)) {
- trace_sched_wake_idle_without_ipi(cpu);
- } else {
- rq_lock_irqsave(rq, &rf);
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
- smp_send_reschedule(cpu);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
+ resched_curr(rq);
}
-
-out:
- rcu_read_unlock();
}
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+/*
+ * Whether CPUs are share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
+ /* Ensure the task will still be allowed to run on the CPU. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ /*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -2378,10 +4017,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -2389,6 +4025,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2396,10 +4040,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
-#if defined(CONFIG_SMP)
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
-#endif
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -2408,6 +4050,56 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ int match;
+
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
+ }
+
+ *success = !!(match = __task_state_match(p, state));
+
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (match < 0)
+ p->saved_state = TASK_RUNNING;
+
+ return match > 0;
+}
+
+/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
@@ -2455,8 +4147,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
- * 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_load_acquire(!X->on_cpu)
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
@@ -2496,31 +4188,47 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ * - p->sched_class
+ * - p->cpus_ptr
+ * - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
+ * - ttwu_queue() -- new rq, for enqueue of the task;
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
*
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
- * case the whole 'p->on_rq && ttwu_remote()' case below
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
@@ -2528,199 +4236,256 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(p->state & state))
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
- p->state = TASK_RUNNING;
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!(p->state & state))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
- goto unlock;
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
+#else
+ cpu = task_cpu(p);
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu, wake_flags);
}
+out:
+ if (success)
+ ttwu_stat(p, task_cpu(p), wake_flags);
-#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ return success;
+}
+
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
/*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
*/
- p->state = TASK_WAKING;
+ if (state == TASK_RUNNING || state == TASK_WAKING)
+ return true;
/*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
*
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
+ * See try_to_wake_up() for a longer comment.
*/
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
- goto unlock;
+ smp_rmb();
+ if (p->on_rq)
+ return true;
+#ifdef CONFIG_SMP
/*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
+ * Ensure the task has finished __schedule() and will not be referenced
+ * anymore. Again, see try_to_wake_up() for a longer comment.
*/
+ smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
+#endif
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu) {
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
-
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-out:
- if (success)
- ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
-
- return success;
+ return false;
}
/**
- * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
- * @p: Process for which the function is to be invoked.
+ * task_call_func - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
- * If the specified task can be quickly locked into a definite state
- * (either sleeping or on a given runqueue), arrange to keep it in that
- * state while invoking @func(@arg). This function can use ->on_rq and
- * task_curr() to work out what the state is, if required. Given that
- * @func can be invoked with a runqueue lock held, it had better be quite
- * lightweight.
+ * Fix the task in it's current state by avoiding wakeups and or rq operations
+ * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
+ * to work out what the state is, if required. Given that @func can be invoked
+ * with a runqueue lock held, it had better be quite lightweight.
*
* Returns:
- * @false if the task slipped out from under the locks.
- * @true if the task was locked onto a runqueue or is sleeping.
- * However, @func can override this by returning @false.
+ * Whatever @func returns
*/
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
- bool ret = false;
+ struct rq *rq = NULL;
struct rq_flags rf;
- struct rq *rq;
+ int ret;
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq(&p->pi_lock);
- if (p->on_rq) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+ if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
- if (task_rq(p) == rq)
- ret = func(p, arg);
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
+
+ if (rq)
rq_unlock(rq, &rf);
- } else {
- switch (p->state) {
- case TASK_RUNNING:
- case TASK_WAKING:
- break;
- default:
- smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
- if (!p->on_rq)
- ret = func(p, arg);
- }
- }
- raw_spin_unlock_irq(&p->pi_lock);
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU. If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct task_struct *t;
+
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ return t;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2758,6 +4523,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2766,13 +4533,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -2790,14 +4554,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -2805,13 +4573,33 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
+static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2821,8 +4609,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -2831,7 +4624,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#ifdef CONFIG_SCHEDSTATS
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-static bool __initdata __sched_schedstats = false;
static void set_schedstats(bool enabled)
{
@@ -2855,16 +4647,11 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
- /*
- * This code is called before jump labels have been set up, so we can't
- * change the static branch directly just yet. Instead set a temporary
- * variable so init_schedstats() can do it later.
- */
if (!strcmp(str, "enable")) {
- __sched_schedstats = true;
+ set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
- __sched_schedstats = false;
+ set_schedstats(false);
ret = 1;
}
out:
@@ -2875,13 +4662,8 @@ out:
}
__setup("schedstats=", setup_schedstats);
-static void __init init_schedstats(void)
-{
- set_schedstats(__sched_schedstats);
-}
-
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -2901,24 +4683,77 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
-#else /* !CONFIG_SCHEDSTATS */
-static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+ {}
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- unsigned long flags;
-
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_NEW;
+ p->__state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2938,7 +4773,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -2957,23 +4792,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rseq_migrate(p);
- /*
- * We're setting the CPU for the first time, we don't migrate,
- * so use __set_task_cpu().
- */
- __set_task_cpu(p, smp_processor_id());
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
@@ -2990,6 +4808,40 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+{
+ unsigned long flags;
+
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
+#endif
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+void sched_post_fork(struct task_struct *p)
+{
+ uclamp_post_fork(p);
+}
+
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -3019,7 +4871,7 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -3031,7 +4883,7 @@ void wake_up_new_task(struct task_struct *p)
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -3039,11 +4891,11 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Nothing relies on rq->lock after this, so its fine to
+ * Nothing relies on rq->lock after this, so it's fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
@@ -3147,8 +4999,11 @@ static inline void prepare_task(struct task_struct *next)
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
+ *
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
- next->on_cpu = 1;
+ WRITE_ONCE(next->on_cpu, 1);
#endif
}
@@ -3156,8 +5011,9 @@ static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
@@ -3169,6 +5025,106 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ void (*func)(struct rq *rq);
+ struct balance_callback *next;
+
+ lockdep_assert_rq_held(rq);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static void balance_push(struct rq *rq);
+
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
+struct balance_callback balance_push_callback = {
+ .next = NULL,
+ .func = balance_push,
+};
+
+static inline struct balance_callback *
+__splice_balance_callbacks(struct rq *rq, bool split)
+{
+ struct balance_callback *head = rq->balance_callback;
+
+ if (likely(!head))
+ return NULL;
+
+ lockdep_assert_rq_held(rq);
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_rq_lock_irqsave(rq, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+ }
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3179,10 +5135,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, _THIS_IP_);
+ spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}
@@ -3193,8 +5149,9 @@ static inline void finish_lock_switch(struct rq *rq)
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- raw_spin_unlock_irq(&rq->lock);
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
}
/*
@@ -3209,6 +5166,22 @@ static inline void finish_lock_switch(struct rq *rq)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_in();
+#endif
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -3231,6 +5204,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+ kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
@@ -3259,7 +5233,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -3290,13 +5264,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
- prev_state = prev->state;
+ prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
+ tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
+ /*
+ * kmap_local_sched_out() is invoked with rq::lock held and
+ * interrupts disabled. There is no requirement for that, but the
+ * sched out code does not have an interrupt enabled section.
+ * Restoring the maps on sched in does not require interrupts being
+ * disabled either.
+ */
+ kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
/*
@@ -3308,70 +5291,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb_sched(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
/* Task is done with its stack. */
put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
- tick_nohz_task_switch();
return rq;
}
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3379,8 +5319,6 @@ static inline void balance_callback(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq;
-
/*
* New tasks start with FORK_PREEMPT_COUNT, see there and
* finish_task_switch() for details.
@@ -3390,8 +5328,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* PREEMPT_COUNT kernels).
*/
- rq = finish_task_switch(prev);
- balance_callback(rq);
+ finish_task_switch(prev);
preempt_enable();
if (current->set_child_tid)
@@ -3418,17 +5355,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -3442,15 +5382,17 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
prepare_lock_switch(rq, next, rf);
@@ -3467,9 +5409,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
@@ -3496,6 +5438,11 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -3514,13 +5461,13 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
/*
- * IO-wait accounting, and how its mostly bollocks (on SMP).
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
*
* The idea behind IO-wait account is to account the idle time that we could
* have spend running if it were not for IO. That is, if we were to improve the
@@ -3549,9 +5496,9 @@ unsigned long nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
@@ -3568,23 +5515,20 @@ unsigned long nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -3656,16 +5600,54 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_resched_latency(struct rq *rq)
+{
+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+ u64 resched_latency, now = rq_clock(rq);
+ static bool warned_once;
+
+ if (sysctl_resched_latency_warn_once && warned_once)
+ return 0;
+
+ if (!need_resched() || !latency_warn_ms)
+ return 0;
+
+ if (system_state == SYSTEM_BOOTING)
+ return 0;
+
+ if (!rq->last_seen_need_resched_ns) {
+ rq->last_seen_need_resched_ns = now;
+ rq->ticks_without_resched = 0;
+ return 0;
+ }
+
+ rq->ticks_without_resched++;
+ resched_latency = now - rq->last_seen_need_resched_ns;
+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ return 0;
+
+ warned_once = true;
-void arch_set_thermal_pressure(struct cpumask *cpus,
- unsigned long th_pressure)
+ return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
{
- int cpu;
+ long val;
+
+ if ((kstrtol(str, 0, &val))) {
+ pr_warn("Unable to set resched_latency_warn_ms\n");
+ return 1;
+ }
- for_each_cpu(cpu, cpus)
- WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+ sysctl_resched_latency_warn_ms = val;
+ return 1;
}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -3678,8 +5660,11 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+ u64 resched_latency;
+
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ arch_scale_freq_tick();
- arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
@@ -3688,13 +5673,22 @@ void scheduler_tick(void)
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
+ if (sched_feat(LATENCY_WARN))
+ resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
- psi_task_tick(rq);
+ sched_core_tick(rq);
+ task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
+ if (sched_feat(LATENCY_WARN) && resched_latency)
+ resched_latency_warn(cpu, resched_latency);
+
perf_event_task_tick();
+ if (curr->flags & PF_WQ_WORKER)
+ wq_worker_tick(curr);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -3744,9 +5738,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -3756,30 +5747,26 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (cpu_online(cpu)) {
+ update_rq_clock(rq);
- update_rq_clock(rq);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -3798,7 +5785,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -3819,7 +5806,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -3947,13 +5934,11 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -3973,7 +5958,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- if (!preempt && prev->state && prev->non_block_count) {
+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
prev->comm, prev->pid, prev->non_block_count);
dump_stack();
@@ -3986,6 +5971,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
+ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4018,7 +6004,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -4026,39 +6012,559 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
- * higher scheduling class, because otherwise those loose the
+ * higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely((prev->sched_class == &idle_sched_class ||
- prev->sched_class == &fair_sched_class) &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
- /* Assumes fair_sched_class->next == idle_sched_class */
+ /* Assume the next prioritized class is idle_sched_class */
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
}
- /* The idle class should always have a runnable task: */
- BUG();
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+ return (task_rq(t)->idle == t);
}
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_task_rq_idle(a) || is_task_rq_idle(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+static inline struct task_struct *pick_task(struct rq *rq)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
+
+ for_each_class(class) {
+ p = class->pick_task(rq);
+ if (p)
+ return p;
+ }
+
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static void queue_core_balance(struct rq *rq);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *p, *max = NULL;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
+ bool need_sync;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ cpu = cpu_of(rq);
+
+ /* Stopper task is switching into idle, no need core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ /*
+ * Reset core_pick so that we don't enter the fastpath when
+ * coming online. core_pick would already be migrated to
+ * another cpu during offline.
+ */
+ rq->core_pick = NULL;
+ return __pick_next_task(rq, prev, rf);
+ }
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ *
+ * rq->core_pick can be NULL if no selection was made for a CPU because
+ * it was either offline or went offline during a sibling's core-wide
+ * selection. In this case, do a core-wide selection.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+
+ rq->core_pick = NULL;
+ goto out;
+ }
+
+ put_prev_task_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
+ need_sync = true;
+ fi_before = true;
+ }
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /*
+ * Optimize for common case where this CPU has no cookies
+ * and there are no cookied tasks running on siblings.
+ */
+ if (!need_sync) {
+ next = pick_task(rq);
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto out_set_next;
+ }
+ }
+
+ /*
+ * For each thread: do the regular task pick and find the max prio task
+ * amongst them.
+ *
+ * Tie-break prio towards the current CPU
+ */
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+ update_rq_clock(rq_i);
+
+ p = rq_i->core_pick = pick_task(rq_i);
+ if (!max || prio_less(max, p, fi_before))
+ max = p;
+ }
+
+ cookie = rq->core->core_cookie = max->core_cookie;
+
+ /*
+ * For each thread: try and find a runnable task that matches @max or
+ * force idle.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
+
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
+ if (!p)
+ p = idle_sched_class.pick_task(rq_i);
+ }
+
+ rq_i->core_pick = p;
+
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+ } else {
+ occ++;
+ }
+ }
+
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ /* Something should have been selected for current CPU */
+ WARN_ON_ONCE(!next);
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+ * happen to come online, but its too late and nothing was
+ * picked for it. That's Ok - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+out_set_next:
+ set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
+ return next;
+}
+
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ return false;
+
+ if (dst->curr != dst->idle)
+ return false;
+
+ p = sched_core_find(src, cookie);
+ if (!p)
+ return false;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next;
+
+ if (!is_cpu_allowed(p, this))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
+
+ deactivate_task(src, p, 0);
+ set_task_cpu(p, this);
+ activate_task(dst, p, 0);
+
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+ return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ guard(preempt)();
+ guard(rcu)();
+
+ raw_spin_rq_unlock_irq(rq);
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_rq_lock_irq(rq);
+}
+
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
+
+static void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
+static void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ WARN_ON_ONCE(rq->core != rq);
+
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ return;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
+ }
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ return;
+
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+
+ if (t == cpu)
+ rq->core = core_rq;
+
+ WARN_ON_ONCE(rq->core != core_rq);
+ }
+}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ return;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ return;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ return;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE 0x0
+#define SM_PREEMPT 0x1
+#define SM_RTLOCK_WAIT 0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT (~0U)
+#else
+# define SM_MASK_PREEMPT SM_PREEMPT
+#endif
+
/*
* __schedule() is the main scheduler function.
*
@@ -4098,7 +6604,7 @@ restart:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -4111,13 +6617,13 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, preempt);
+ schedule_debug(prev, !!sched_mode);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(preempt);
+ rcu_note_context_switch(!!sched_mode);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -4140,25 +6646,23 @@ static void __sched notrace __schedule(bool preempt)
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
/*
* We must load prev->state once (task_struct::state is volatile), such
- * that:
- *
- * - we form a control dependency vs deactivate_task() below.
- * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ * that we form a control dependency vs deactivate_task() below.
*/
- prev_state = prev->state;
- if (!preempt && prev_state) {
+ prev_state = READ_ONCE(prev->__state);
+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
- prev->state = TASK_RUNNING;
+ WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -4187,6 +6691,9 @@ static void __sched notrace __schedule(bool preempt)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+ rq->last_seen_need_resched_ns = 0;
+#endif
if (likely(prev != next)) {
rq->nr_switches++;
@@ -4211,18 +6718,18 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
+ migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
}
-
- balance_callback(rq);
}
void __noreturn do_task_dead(void)
@@ -4233,7 +6740,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -4243,35 +6750,39 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state)
- return;
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
+ unsigned int task_flags;
/*
- * If a worker went to sleep, notify and ask workqueue whether
- * it wants to wake up a task to maintain concurrency.
- * As this function is called inside the schedule() context,
- * we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker and because wq_worker_sleeping()
- * requires it.
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
*/
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- preempt_disable();
- if (tsk->flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- preempt_enable_no_resched();
- }
+ lock_map_acquire_try(&sched_map);
- if (tsk_is_pi_blocked(tsk))
- return;
+ task_flags = tsk->flags;
+ /*
+ * If a worker goes to sleep, notify and ask workqueue whether it
+ * wants to wake up a task to maintain concurrency.
+ */
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
+
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -4284,16 +6795,26 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -4317,13 +6838,13 @@ void __sched schedule_idle(void)
* current task can be in any other state. Note, idle is always in the
* TASK_RUNNING state.
*/
- WARN_ON_ONCE(current->state);
+ WARN_ON_ONCE(current->__state);
do {
- __schedule(false);
+ __schedule(SM_NONE);
} while (need_resched());
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -4354,6 +6875,14 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ __schedule_loop(SM_RTLOCK_WAIT);
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
static void __sched notrace preempt_schedule_common(void)
{
do {
@@ -4372,7 +6901,7 @@ static void __sched notrace preempt_schedule_common(void)
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -4396,12 +6925,32 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
+#endif
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
@@ -4445,7 +6994,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -4454,6 +7003,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
+#endif
+
#endif /* CONFIG_PREEMPTION */
/*
@@ -4474,7 +7044,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
@@ -4485,13 +7055,51 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
if (pi_task)
@@ -4545,7 +7153,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
* right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
* ensure a task is de-boosted (pi_task is set to NULL) before the
* task is allowed to run again (and can exit). This ensures the pointer
- * points to a blocked task -- which guaratees the task is present.
+ * points to a blocked task -- which guarantees the task is present.
*/
p->pi_top_task = pi_task;
@@ -4600,26 +7208,24 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
+ p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -4630,9 +7236,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock(rq);
+
preempt_enable();
}
#else
@@ -4645,9 +7253,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio;
- struct rq_flags rf;
struct rq *rq;
+ int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
@@ -4655,19 +7262,22 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &rf);
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
- * it wont have any effect on scheduling until the task is
+ * it won't have any effect on scheduling until the task is
* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ return;
}
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -4690,24 +7300,33 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -4750,8 +7369,12 @@ SYSCALL_DEFINE1(nice, int, increment)
* @p: the task in question.
*
* Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ * deadline -101 -1 0
*/
int task_prio(const struct task_struct *p)
{
@@ -4810,6 +7433,120 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+#endif /* CONFIG_SMP */
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
@@ -4821,6 +7558,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
@@ -4852,64 +7604,95 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
+/*
+ * Check the target process has a UID that matches the current process's:
+ */
+static bool check_same_owner(struct task_struct *p)
{
+ const struct cred *cred = current_cred(), *pcred;
+ guard(rcu)();
+
+ pcred = __task_cred(p);
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+}
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
/*
- * If params can't change scheduling class changes aren't allowed
- * either.
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
*/
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
+ if (dl_policy(policy))
+ goto req_priv;
/*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
-/*
- * Check the target process has a UID that matches the current process's:
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- bool match;
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
- rcu_read_lock();
- pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
}
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
+ struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -4930,68 +7713,20 @@ recheck:
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ if (attr->sched_priority > MAX_RT_PRIO-1)
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -5007,8 +7742,14 @@ recheck:
return retval;
}
- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -5084,8 +7825,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}
@@ -5102,6 +7843,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -5110,8 +7852,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -5124,7 +7866,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -5144,23 +7889,25 @@ change:
/* Avoid rq from going away on us: */
preempt_disable();
+ head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);
if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}
/* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
+ balance_callbacks(rq, head);
preempt_enable();
return 0;
unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}
@@ -5188,6 +7935,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Use sched_set_fifo(), read its comment.
+ *
* Return: 0 on success. An error code otherwise.
*
* NOTE that the task may be already dead.
@@ -5197,18 +7946,17 @@ int sched_setscheduler(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, true);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr);
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -5228,33 +7976,67 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- struct task_struct *p;
- int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- return retval;
+ return sched_setscheduler(p, policy, &lparam);
}
/*
@@ -5302,6 +8084,16 @@ err_size:
return -E2BIG;
}
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -5340,7 +8132,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
{
struct sched_attr attr;
- struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
@@ -5355,19 +8146,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- if (likely(p)) {
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
- }
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
- return retval;
+ return sched_setattr(p, &attr);
}
/**
@@ -5385,16 +8171,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
+ guard(rcu)();
p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
}
- rcu_read_unlock();
return retval;
}
@@ -5415,30 +8202,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (!param || pid < 0)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
/*
@@ -5498,126 +8278,164 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
-
- rcu_read_unlock();
+ }
return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
- cpumask_var_t cpus_allowed, new_mask;
- struct task_struct *p;
- int retval;
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
- rcu_read_lock();
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ guard(rcu)();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ return -EBUSY;
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
- return -ESRCH;
- }
+ return 0;
+}
+#endif
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
+static int
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
+ cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
- retval = -EPERM;
- if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
- retval = security_task_setscheduler(p);
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
}
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
}
+
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
-out_put_task:
- put_task_struct(p);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
+ int retval;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
+
+ if (!check_same_owner(p)) {
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ return -ENOMEM;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
return retval;
}
@@ -5659,28 +8477,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
- unsigned long flags;
int retval;
- rcu_read_lock();
-
- retval = -ESRCH;
+ guard(rcu)();
p = find_process_by_pid(pid);
if (!p)
- goto out_unlock;
+ return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
- goto out_unlock;
+ return retval;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return 0;
}
/**
@@ -5703,14 +8514,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
@@ -5720,14 +8531,6 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
return ret;
}
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
static void do_sched_yield(void)
{
struct rq_flags rf;
@@ -5738,34 +8541,84 @@ static void do_sched_yield(void)
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
preempt_disable();
- rq_unlock(rq, &rf);
+ rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
}
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
SYSCALL_DEFINE0(sched_yield)
{
do_sched_yield();
return 0;
}
-#ifndef CONFIG_PREEMPTION
-int __sched _cond_resched(void)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
+ /*
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ */
+#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
+#endif
return 0;
}
-EXPORT_SYMBOL(_cond_resched);
+EXPORT_SYMBOL(__cond_resched);
+#endif
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ klp_sched_try_switch();
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
#endif
/*
@@ -5785,9 +8638,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -5796,6 +8647,249 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_read(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_write(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
+
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
+
+ return -EINVAL;
+}
+
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
+
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
+ }
+
+ sched_dynamic_update(mode);
+ return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
/**
* yield - yield the current processor to other threads.
*
@@ -5803,7 +8897,7 @@ EXPORT_SYMBOL(__cond_resched_lock);
*
* The scheduler is at all times free to pick the calling task as the most
* eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
+ * it, it's already broken.
*
* Typical broken usage is:
*
@@ -5844,55 +8938,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
- unsigned long flags;
int yielded = 0;
- local_irq_save(flags);
- rq = this_rq();
+ scoped_guard (irqsave) {
+ rq = this_rq();
again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
+ if (!curr->sched_class->yield_to_task)
+ return 0;
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
+ if (curr->sched_class != p->sched_class)
+ return 0;
- if (task_running(p_rq, p) || p->state)
- goto out_unlock;
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
}
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
+ if (yielded)
schedule();
return yielded;
@@ -5904,8 +8989,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -5956,7 +9040,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
+ ret = MAX_RT_PRIO-1;
break;
case SCHED_DEADLINE:
case SCHED_NORMAL:
@@ -5996,38 +9080,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
+ unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
- rcu_read_unlock();
jiffies_to_timespec64(time_slice, t);
return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
/**
@@ -6074,10 +9150,10 @@ void sched_show_task(struct task_struct *p)
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
- if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ if (task_is_running(p))
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6086,11 +9162,12 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
+ print_stop_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -6099,36 +9176,31 @@ EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
+ unsigned int state = READ_ONCE(p->__state);
+
/* no filter, everything matches */
if (!state_filter)
return true;
/* filter, but doesn't match */
- if (!(p->state & state_filter))
+ if (!(state & state_filter))
return false;
/*
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
}
-void show_state_filter(unsigned long state_filter)
+void show_state_filter(unsigned int state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
@@ -6164,31 +9236,39 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
+#ifdef CONFIG_SMP
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
+#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
- idle->state = TASK_RUNNING;
+ idle->__state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- idle->flags |= PF_IDLE;
-
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP
/*
- * Its possible that init_idle() gets called multiple times on a task,
+ * It's possible that init_idle() gets called multiple times on a task,
* in that case do_set_cpus_allowed() will not do the right thing.
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu));
+ set_cpus_allowed_common(idle, &ac);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -6210,7 +9290,7 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -6234,7 +9314,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -6242,8 +9322,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -6256,16 +9335,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
-
-out:
return ret;
}
@@ -6339,119 +9411,146 @@ void idle_task_exit(void)
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
+static int __balance_push_cpu_stop(void *arg)
{
- long delta = calc_load_fold_active(rq, 1);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-}
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
- const struct sched_class *class;
- struct task_struct *next;
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
- for_each_class(class) {
- next = class->pick_next_task(rq);
- if (next) {
- next->sched_class->put_prev_task(rq, next);
- return next;
- }
+ update_rq_clock(rq);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
}
- /* The idle class should always have a runnable task */
- BUG();
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
*
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
*/
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void balance_push(struct rq *rq)
{
- struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- struct rq_flags orf = *rf;
- int dest_cpu;
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_rq_held(rq);
/*
- * Fudge the rq selection such that the below task selection loop
- * doesn't get stuck on the currently eligible stop task.
- *
- * We're currently inside stop_machine() and the rq is either stuck
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
- * either way we should never end up calling schedule() until we're
- * done here.
+ * Ensure the thing is persistent until balance_push_set(.on = false);
*/
- rq->stop = NULL;
+ rq->balance_callback = &balance_push_callback;
/*
- * put_prev_task() and pick_next_task() sched
- * class method both need to have an up-to-date
- * value of rq->clock[_task]
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- update_rq_clock(rq);
-
- for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread:
- */
- if (rq->nr_running == 1)
- break;
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
+ return;
- next = __pick_migrate_task(rq);
+ /*
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
+ */
+ if (kthread_is_per_cpu(push_task) ||
+ is_migration_disabled(push_task)) {
/*
- * Rules for changing task_struct::cpus_mask are holding
- * both pi_lock and rq->lock, such that holding either
- * stabilizes the mask.
+ * If this is the idle task on the outgoing CPU try to wake
+ * up the hotplug control thread which might wait for the
+ * last task to vanish. The rcuwait_active() check is
+ * accurate here because the waiter is pinned on this CPU
+ * and can't obviously be running in parallel.
*
- * Drop rq->lock is not quite as disastrous as it usually is
- * because !cpu_active at this point, which means load-balance
- * will not interfere. Also, stop-machine.
- */
- rq_unlock(rq, rf);
- raw_spin_lock(&next->pi_lock);
- rq_relock(rq, rf);
-
- /*
- * Since we're inside stop-machine, _nothing_ should have
- * changed the task, WARN if weird stuff happened, because in
- * that case the above rq->lock drop is a fail too.
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- raw_spin_unlock(&next->pi_lock);
- continue;
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
+ raw_spin_rq_unlock(rq);
+ rcuwait_wake_up(&rq->hotplug_wait);
+ raw_spin_rq_lock(rq);
}
+ return;
+ }
- /* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- rq = __migrate_task(rq, rf, next, dest_cpu);
- if (rq != dead_rq) {
- rq_unlock(rq, rf);
- rq = dead_rq;
- *rf = orf;
- rq_relock(rq, rf);
- }
- raw_spin_unlock(&next->pi_lock);
+ get_task_struct(push_task);
+ /*
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
+ */
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ preempt_enable();
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which kthread_is_per_cpu() and will push this task away.
+ */
+ raw_spin_rq_lock(rq);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (on) {
+ WARN_ON_ONCE(rq->balance_callback);
+ rq->balance_callback = &balance_push_callback;
+ } else if (rq->balance_callback == &balance_push_callback) {
+ rq->balance_callback = NULL;
}
+ rq_unlock_irqrestore(rq, &rf);
+}
- rq->stop = stop;
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+ struct rq *rq = this_rq();
+
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+ TASK_UNINTERRUPTIBLE);
+}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
+static inline void balance_hotplug_wait(void)
+{
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
@@ -6474,6 +9573,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -6522,8 +9622,10 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
+ int ret = dl_bw_check_overflow(cpu);
+
+ if (ret)
+ return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
@@ -6537,6 +9639,12 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+ /*
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
+ */
+ balance_push_set(cpu, false);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
@@ -6547,6 +9655,7 @@ int sched_cpu_activate(unsigned int cpu)
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
@@ -6572,32 +9681,64 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
int ret;
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
+
+ /*
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
+ */
+ balance_push_set(cpu, true);
+
/*
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
- * users of this state to go away such that all new such users will
- * observe it.
+ * We've cleared cpu_active_mask / set balance_push, wait for all
+ * preempt-disabled and RCU users of this state to go away such that
+ * all new such users will observe it.
+ *
+ * Specifically, we rely on ttwu to no longer target this CPU, see
+ * ttwu_queue_cond() and is_cpu_allowed().
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
synchronize_rcu();
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
return 0;
+ sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ balance_push_set(cpu, false);
set_cpu_active(cpu, true);
+ sched_update_numa(cpu, true);
return ret;
}
sched_domains_numa_masks_clear(cpu);
@@ -6614,12 +9755,67 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+ balance_hotplug_wait();
+ return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq, 1);
+
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+{
+ struct task_struct *g, *p;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
+
+ if (!task_on_rq_queued(p))
+ continue;
+
+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+ }
+}
+
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6629,25 +9825,23 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ WARN(true, "Dying CPU not properly vacated!");
+ dump_rq_tasks(rq, KERN_WARNING);
}
- migrate_tasks(rq, &rf);
- BUG_ON(rq->nr_running != 1);
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -6659,8 +9853,9 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
+ current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
init_sched_rt_class();
@@ -6699,17 +9894,22 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
int i;
+ /* Make sure the linker didn't screw up */
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
+#ifdef CONFIG_SMP
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
+#endif
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6729,7 +9929,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6740,17 +9940,8 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -6774,7 +9965,7 @@ void __init sched_init(void)
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -6813,8 +10004,8 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -6831,11 +10022,27 @@ void __init sched_init(void)
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
- rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = rq;
+ rq->core_pick = NULL;
+ rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
+
+ rq->core_cookie = 0UL;
+#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
@@ -6843,10 +10050,18 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ WARN_ON(!set_kthread_struct(current));
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -6858,45 +10073,61 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
- init_schedstats();
-
psi_init();
init_uclamp();
+ preempt_dynamic_init();
+
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = preempt_count() + rcu_preempt_depth();
- return (nested == preempt_offset);
-}
-
-void __might_sleep(const char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line)
{
+ unsigned int state = get_current_state();
/*
* Blocking primitives will set (and therefore destroy) current->state,
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
- "state=%lx set at [<%p>] %pS\n",
- current->state,
+ "state=%x set at [<%p>] %pS\n", state,
(void *)current->task_state_change,
(void *)current->task_state_change);
- ___might_sleep(file, line, preempt_offset);
+ __might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
-void ___might_sleep(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
{
/* Ratelimiting timestamp: */
static unsigned long prev_jiffy;
@@ -6906,7 +10137,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* WARN_ON_ONCE() by default, no rate limit required: */
rcu_sleep_check();
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
!is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
@@ -6919,29 +10150,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* Save this before calling printk(), since that will clobber it: */
preempt_disable_ip = get_preempt_disable_ip(current);
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(), current->non_block_count,
- current->pid, current->comm);
+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && !preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(KERN_ERR, preempt_disable_ip);
- }
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
-EXPORT_SYMBOL(___might_sleep);
+EXPORT_SYMBOL(__might_resched);
void __cant_sleep(const char *file, int line, int preempt_offset)
{
@@ -6970,6 +10205,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);
+
+#ifdef CONFIG_SMP
+void __cant_migrate(const char *file, int line)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (is_migration_disabled(current))
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > 0)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_migrate);
+#endif
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -6989,9 +10257,9 @@ void normalize_rt_tasks(void)
continue;
p->se.exec_start = 0;
- schedstat_set(p->se.statistics.wait_start, 0);
- schedstat_set(p->se.statistics.sleep_start, 0);
- schedstat_set(p->se.statistics.block_start, 0);
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/*
@@ -7010,9 +10278,9 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -7034,30 +10302,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -7085,6 +10330,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -7128,32 +10389,42 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -7165,11 +10436,17 @@ static void sched_change_group(struct task_struct *tsk, int type)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
+
+ return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+ tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -7186,10 +10463,20 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq_flags rf;
+ struct task_group *group;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
+ /*
+ * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+ * group changes.
+ */
+ group = sched_get_task_group(tsk);
+ if (group == tsk->sched_task_group)
+ return;
+
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -7200,7 +10487,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -7213,8 +10500,6 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
-
- task_rq_unlock(rq, tsk, &rf);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7251,6 +10536,8 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
#endif
@@ -7261,7 +10548,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -7271,56 +10558,22 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
-}
-
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
+ sched_unregister_group(tg);
}
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if its
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (task->state == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -7341,6 +10594,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
enum uclamp_id clamp_id;
unsigned int clamps;
+ lockdep_assert_held(&uclamp_mutex);
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
@@ -7373,7 +10629,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
}
/* Immediately update descendants RUNNABLE tasks */
- uclamp_update_active_tasks(css, clamps);
+ uclamp_update_active_tasks(css);
}
}
@@ -7431,8 +10687,10 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ static_branch_enable(&sched_uclamp_used);
+
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
@@ -7447,9 +10705,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
@@ -7475,10 +10730,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
@@ -7530,7 +10785,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
+ u64 burst)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -7547,7 +10803,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
- * Likewise, bound things on the otherside by preventing insane quota
+ * Likewise, bound things on the other side by preventing insane quota
* periods. This also allows us to normalize in computing quota
* feasibility.
*/
@@ -7560,15 +10816,20 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (quota != RUNTIME_INF && quota > max_cfs_runtime)
return -EINVAL;
+ if (quota != RUNTIME_INF && (burst > quota ||
+ burst + quota > max_cfs_runtime))
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -7578,45 +10839,46 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
period = ktime_to_ns(tg->cfs_bandwidth.period);
+ burst = tg->cfs_bandwidth.burst;
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
@@ -7624,7 +10886,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
else
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_quota(struct task_group *tg)
@@ -7642,15 +10904,16 @@ static long tg_get_cfs_quota(struct task_group *tg)
static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
return -EINVAL;
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
+ burst = tg->cfs_bandwidth.burst;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_period(struct task_group *tg)
@@ -7663,6 +10926,30 @@ static long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
+static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
+{
+ u64 quota, period, burst;
+
+ if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ burst = (u64)cfs_burst_us * NSEC_PER_USEC;
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
+ quota = tg->cfs_bandwidth.quota;
+
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
+}
+
+static long tg_get_cfs_burst(struct task_group *tg)
+{
+ u64 burst_us;
+
+ burst_us = tg->cfs_bandwidth.burst;
+ do_div(burst_us, NSEC_PER_USEC);
+
+ return burst_us;
+}
+
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -7687,6 +10974,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
+static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_burst(css_tg(css));
+}
+
+static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_burst_us)
+{
+ return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
+}
+
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -7732,11 +11031,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -7751,7 +11055,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -7763,11 +11066,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -7780,15 +11080,42 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
+ struct sched_statistics *stats;
u64 ws = 0;
int i;
- for_each_possible_cpu(i)
- ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+ for_each_possible_cpu(i) {
+ stats = __schedstats_from_se(tg->se[i]);
+ ws += schedstat_val(stats->wait_sum);
+ }
seq_printf(sf, "wait_sum %llu\n", ws);
}
+ seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
+ seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
+
+ return 0;
+}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -7820,6 +11147,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -7827,6 +11168,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -7840,9 +11186,18 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_cfs_period_write_u64,
},
{
+ .name = "cfs_burst_us",
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
+ {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -7880,16 +11235,38 @@ static int cpu_extra_stat_show(struct seq_file *sf,
{
struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- u64 throttled_usec;
+ u64 throttled_usec, burst_usec;
throttled_usec = cfs_b->throttled_time;
do_div(throttled_usec, NSEC_PER_USEC);
+ burst_usec = cfs_b->burst_time;
+ do_div(burst_usec, NSEC_PER_USEC);
seq_printf(sf, "nr_periods %d\n"
"nr_throttled %d\n"
- "throttled_usec %llu\n",
+ "throttled_usec %llu\n"
+ "nr_bursts %d\n"
+ "burst_usec %llu\n",
cfs_b->nr_periods, cfs_b->nr_throttled,
- throttled_usec);
+ throttled_usec, cfs_b->nr_burst, burst_usec);
+ }
+#endif
+ return 0;
+}
+
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
}
#endif
return 0;
@@ -8004,12 +11381,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
{
struct task_group *tg = css_tg(of_css(of));
u64 period = tg_get_cfs_period(tg);
+ u64 burst = tg_get_cfs_burst(tg);
u64 quota;
int ret;
ret = cpu_period_quota_parse(buf, &period, &quota);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota);
+ ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
return ret ?: nbytes;
}
#endif
@@ -8028,6 +11406,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -8036,6 +11420,12 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
+ {
+ .name = "max.burst",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
@@ -8060,8 +11450,10 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+ .css_local_stat_show = cpu_local_stat_show,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
@@ -8073,6 +11465,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (cpu == smp_processor_id() && in_hardirq()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
@@ -8118,4 +11523,524 @@ const u32 sched_prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
+
+#ifdef CONFIG_SCHED_MM_CID
+
+/*
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/*
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic properties we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0 CPU1
+ *
+ * Context switch CS-1 Remote-clear
+ * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
+ * (implied barrier after cmpxchg)
+ * - switch_mm_cid()
+ * - memory barrier (see switch_mm_cid()
+ * comment explaining how this barrier
+ * is combined with other scheduler
+ * barriers)
+ * - mm_cid_get (next)
+ * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+ t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
+{
+ struct mm_struct *mm = t->mm;
+ struct task_struct *src_task;
+ int src_cid, last_mm_cid;
+
+ if (!mm)
+ return -1;
+
+ last_mm_cid = t->last_mm_cid;
+ /*
+ * If the migrated task has no last cid, or if the current
+ * task on src rq uses the cid, it means the source cid does not need
+ * to be moved to the destination cpu.
+ */
+ if (last_mm_cid == -1)
+ return -1;
+ src_cid = READ_ONCE(src_pcpu_cid->cid);
+ if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+ return -1;
+
+ /*
+ * If we observe an active task using the mm on this rq, it means we
+ * are not the last task to be migrated from this cpu for this mm, so
+ * there is no need to move src_cid to the destination cpu.
+ */
+ guard(rcu)();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ t->last_mm_cid = -1;
+ return -1;
+ }
+
+ return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
+{
+ struct task_struct *src_task;
+ struct mm_struct *mm = t->mm;
+ int lazy_cid;
+
+ if (src_cid == -1)
+ return -1;
+
+ /*
+ * Attempt to clear the source cpu cid to move it to the destination
+ * cpu.
+ */
+ lazy_cid = mm_cid_set_lazy_put(src_cid);
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+ return -1;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, this task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ }
+
+ /*
+ * The src_cid is unused, so it can be unset.
+ */
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ return -1;
+ return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+ struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+ struct mm_struct *mm = t->mm;
+ int src_cid, dst_cid, src_cpu;
+ struct rq *src_rq;
+
+ lockdep_assert_rq_held(dst_rq);
+
+ if (!mm)
+ return;
+ src_cpu = t->migrate_from_cpu;
+ if (src_cpu == -1) {
+ t->last_mm_cid = -1;
+ return;
+ }
+ /*
+ * Move the src cid if the dst cid is unset. This keeps id
+ * allocation closest to 0 in cases where few threads migrate around
+ * many cpus.
+ *
+ * If destination cid is already set, we may have to just clear
+ * the src cid to ensure compactness in frequent migrations
+ * scenarios.
+ *
+ * It is not useful to clear the src cid when the number of threads is
+ * greater or equal to the number of allowed cpus, because user-space
+ * can expect that the number of allowed cids can reach the number of
+ * allowed cpus.
+ */
+ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+ dst_cid = READ_ONCE(dst_pcpu_cid->cid);
+ if (!mm_cid_is_unset(dst_cid) &&
+ atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ return;
+ src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+ src_rq = cpu_rq(src_cpu);
+ src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+ if (src_cid == -1)
+ return;
+ src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+ src_cid);
+ if (src_cid == -1)
+ return;
+ if (!mm_cid_is_unset(dst_cid)) {
+ __mm_cid_put(mm, src_cid);
+ return;
+ }
+ /* Move src_cid to dst cpu. */
+ mm_cid_snapshot_time(dst_rq, mm);
+ WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+ int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ int cid, lazy_cid;
+
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid))
+ return;
+
+ /*
+ * Clear the cpu cid if it is set to keep cid allocation compact. If
+ * there happens to be other tasks left on the source cpu using this
+ * mm, the next task using this mm will reallocate its cid on context
+ * switch.
+ */
+ lazy_cid = mm_cid_set_lazy_put(cid);
+ if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+ return;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, that task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
+ }
+
+ /*
+ * The cid is unused, so it can be unset.
+ * Disable interrupts to keep the window of cid ownership without rq
+ * lock small.
+ */
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
+}
+
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct mm_cid *pcpu_cid;
+ struct task_struct *curr;
+ u64 rq_clock;
+
+ /*
+ * rq->clock load is racy on 32-bit but one spurious clear once in a
+ * while is irrelevant.
+ */
+ rq_clock = READ_ONCE(rq->clock);
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+ /*
+ * In order to take care of infrequently scheduled tasks, bump the time
+ * snapshot associated with this cid if an active task using the mm is
+ * observed on this rq.
+ */
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
+ }
+
+ if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+ int weight)
+{
+ struct mm_cid *pcpu_cid;
+ int cid;
+
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid) || cid < weight)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void task_mm_cid_work(struct callback_head *work)
+{
+ unsigned long now = jiffies, old_scan, next_scan;
+ struct task_struct *t = current;
+ struct cpumask *cidmask;
+ struct mm_struct *mm;
+ int weight, cpu;
+
+ SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+
+ work->next = work; /* Prevent double-add */
+ if (t->flags & PF_EXITING)
+ return;
+ mm = t->mm;
+ if (!mm)
+ return;
+ old_scan = READ_ONCE(mm->mm_cid_next_scan);
+ next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (!old_scan) {
+ unsigned long res;
+
+ res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
+ if (res != old_scan)
+ old_scan = res;
+ else
+ old_scan = next_scan;
+ }
+ if (time_before(now, old_scan))
+ return;
+ if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
+ return;
+ cidmask = mm_cidmask(mm);
+ /* Clear cids that were not recently used. */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_old(mm, cpu);
+ weight = cpumask_weight(cidmask);
+ /*
+ * Clear cids that are greater or equal to the cidmask weight to
+ * recompact it.
+ */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+}
+
+void init_sched_mm_cid(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ int mm_users = 0;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1)
+ mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ }
+ t->cid_work.next = &t->cid_work; /* Protect against double add */
+ init_task_work(&t->cid_work, task_mm_cid_work);
+}
+
+void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->cid_work;
+ unsigned long now = jiffies;
+
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+ work->next != work)
+ return;
+ if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+ return;
+ task_work_add(curr, work, TWA_RESUME);
+}
+
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ guard(rq_lock_irqsave)(rq);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ guard(rq_lock_irqsave)(rq);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ }
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..a57fd8f27498
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+static unsigned long sched_core_alloc_cookie(void)
+{
+ struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return 0;
+
+ refcount_set(&ck->refcnt, 1);
+ sched_core_get();
+
+ return (unsigned long)ck;
+}
+
+static void sched_core_put_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+ kfree(ptr);
+ sched_core_put();
+ }
+}
+
+static unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr)
+ refcount_inc(&ptr->refcnt);
+
+ return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+static unsigned long sched_core_update_cookie(struct task_struct *p,
+ unsigned long cookie)
+{
+ unsigned long old_cookie;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Since creating a cookie implies sched_core_get(), and we cannot set
+ * a cookie until after we've created it, similarly, we cannot destroy
+ * a cookie until after we've removed it, we must have core scheduling
+ * enabled here.
+ */
+ SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+ if (sched_core_enqueued(p))
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
+
+ old_cookie = p->core_cookie;
+ p->core_cookie = cookie;
+
+ /*
+ * Consider the cases: !prev_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
+ sched_core_enqueue(rq, p);
+
+ /*
+ * If task is currently running, it may not be compatible anymore after
+ * the cookie change, so enter the scheduler on its CPU to schedule it
+ * away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
+ */
+ if (task_on_cpu(rq, p))
+ resched_curr(rq);
+
+ task_rq_unlock(rq, p, &rf);
+
+ return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+ unsigned long cookie, flags;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cookie = sched_core_get_cookie(p->core_cookie);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return cookie;
+}
+
+void sched_core_fork(struct task_struct *p)
+{
+ RB_CLEAR_NODE(&p->core_node);
+ p->core_cookie = sched_core_clone_cookie(current);
+}
+
+void sched_core_free(struct task_struct *p)
+{
+ sched_core_put_cookie(p->core_cookie);
+}
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+ cookie = sched_core_get_cookie(cookie);
+ cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr)
+{
+ unsigned long cookie = 0, id = 0;
+ struct task_struct *task, *p;
+ struct pid *grp;
+ int err = 0;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
+ if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+ (cmd != PR_SCHED_CORE_GET && uaddr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case PR_SCHED_CORE_GET:
+ if (type != PIDTYPE_PID || uaddr & 7) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ if (cookie) {
+ /* XXX improve ? */
+ ptr_to_hashval((void *)cookie, &id);
+ }
+ err = put_user(id, (u64 __user *)uaddr);
+ goto out;
+
+ case PR_SCHED_CORE_CREATE:
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ break;
+
+ case PR_SCHED_CORE_SHARE_TO:
+ cookie = sched_core_clone_cookie(current);
+ break;
+
+ case PR_SCHED_CORE_SHARE_FROM:
+ if (type != PIDTYPE_PID) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ __sched_core_set(current, cookie);
+ goto out;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (type == PIDTYPE_PID) {
+ __sched_core_set(task, cookie);
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ grp = task_pid_type(task, type);
+
+ do_each_pid_thread(grp, type, p) {
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out_tasklist;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ do_each_pid_thread(grp, type, p) {
+ __sched_core_set(p, cookie);
+ } while_each_pid_thread(grp, type, p);
+out_tasklist:
+ read_unlock(&tasklist_lock);
+
+out:
+ sched_core_put_cookie(cookie);
+ put_task_struct(task);
+ return err;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (p == rq_i->idle)
+ continue;
+
+ /*
+ * Note: this will account forceidle to the current cpu, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..0de9dda09949 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
-#include <asm/irq_regs.h>
-#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
@@ -21,15 +20,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-struct cpuacct_usage {
- u64 usages[CPUACCT_STAT_NSTATS];
-};
-
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
- struct cpuacct_usage __percpu *cpuusage;
+ u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +44,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +63,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
@@ -99,56 +94,66 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
- * the sum of suages.
+ * the sum of usages.
*/
- BUG_ON(index > CPUACCT_STAT_NSTATS);
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
- if (index == CPUACCT_STAT_NSTATS) {
- int i = 0;
-
- data = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- data += cpuusage->usages[i];
- } else {
- data = cpuusage->usages[index];
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
}
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- int i;
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow to reset global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
-
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- cpuusage->usages[i] = val;
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
@@ -196,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
- cpuacct_cpuusage_write(ca, cpu, 0);
+ cpuacct_cpuusage_write(ca, cpu);
return 0;
}
@@ -243,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
seq_printf(m, "%d", cpu);
-
- for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit
- * platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-#endif
-
- seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#endif
- }
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
@@ -270,25 +260,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
- s64 val[CPUACCT_STAT_NSTATS];
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
- memset(val, 0, sizeof(val));
+ memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
- seq_printf(sf, "%s %lld\n",
- cpuacct_stat_desc[stat],
- (long long)nsec_to_clock_t(val[stat]));
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
}
return 0;
@@ -338,19 +333,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
- if (regs && user_mode(regs))
- index = CPUACCT_STAT_USER;
-
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(ca->cpuusage->usages[index], cputime);
-
- rcu_read_unlock();
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -362,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
struct cpuacct *ca;
- rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
__this_cpu_add(ca->cpustat->cpustat[index], val);
- rcu_read_unlock();
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
-#include "sched.h"
static inline int parent(int i)
{
@@ -120,14 +119,38 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!sched_asym_cpucap_active())
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = arch_scale_cpu_capacity(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#include <linux/cpufreq.h>
-
-#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..eece6244f9d2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
@@ -26,7 +19,7 @@ struct sugov_policy {
struct sugov_tunables *tunables;
struct list_head tunables_hook;
- raw_spinlock_t update_lock; /* For shared policies */
+ raw_spinlock_t update_lock;
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
@@ -53,8 +46,8 @@ struct sugov_cpu {
unsigned int iowait_boost;
u64 last_update;
- unsigned long bw_dl;
- unsigned long max;
+ unsigned long util;
+ unsigned long bw_min;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -102,7 +95,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->next_freq == next_freq)
+ if (sg_policy->need_freq_update)
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ else if (sg_policy->next_freq == next_freq)
return false;
sg_policy->next_freq = next_freq;
@@ -111,37 +106,38 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
- struct cpufreq_policy *policy = sg_policy->policy;
- int cpu;
-
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
-
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
-
- policy->cur = next_freq;
-
- if (trace_cpu_frequency_enabled()) {
- for_each_cpu(cpu, policy->cpus)
- trace_cpu_frequency(next_freq, cpu);
+ if (!sg_policy->work_in_progress) {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
}
}
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
+ unsigned int freq = arch_scale_freq_ref(policy->cpu);
- if (!sg_policy->work_in_progress) {
- sg_policy->work_in_progress = true;
- irq_work_queue(&sg_policy->irq_work);
- }
+ if (freq)
+ return freq;
+
+ if (arch_scale_freq_invariant())
+ return policy->cpuinfo.max_freq;
+
+ /*
+ * Apply a 25% margin so that we select a higher frequency than
+ * the current one before the CPU is fully busy:
+ */
+ return policy->cur + (policy->cur >> 2);
}
/**
@@ -170,135 +166,43 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int freq = arch_scale_freq_invariant() ?
- policy->cpuinfo.max_freq : policy->cur;
+ unsigned int freq;
+ freq = get_capacity_ref_freq(policy);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
{
- unsigned long dl_util, util, irq;
- struct rq *rq = cpu_rq(cpu);
-
- if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
/*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
*/
- util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
-
- /*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
- */
- if (util + dl_util >= max)
- return max;
-
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, max);
- util += irq;
-
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return max(min, max);
}
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long util = cpu_util_cfs(rq);
- unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
+ unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
- sg_cpu->max = max;
- sg_cpu->bw_dl = cpu_bw_dl(rq);
-
- return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ util = max(util, boost);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
/**
@@ -375,8 +279,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
- * @util: the utilization to (eventually) boost
- * @max: the maximum value the utilization can be boosted to
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -391,17 +294,15 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* being more conservative on tasks which does sporadic IO operations.
*/
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long util, unsigned long max)
+ unsigned long max_cap)
{
- unsigned long boost;
-
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return util;
+ return 0;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return util;
+ return 0;
if (!sg_cpu->iowait_boost_pending) {
/*
@@ -410,18 +311,17 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return util;
+ return 0;
}
}
sg_cpu->iowait_boost_pending = false;
/*
- * @util is already in capacity scale; convert iowait_boost
+ * sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
- return max(boost, util);
+ return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -441,83 +341,137 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
-static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
- if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->limits_changed = true;
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
+ sg_cpu->sg_policy->limits_changed = true;
}
-static void sugov_update_single(struct update_util_data *hook, u64 time,
- unsigned int flags)
+static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
- struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
- unsigned int next_f;
- bool busy;
+ unsigned long boost;
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
- if (!sugov_should_update_freq(sg_policy, time))
- return;
+ if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
+ return false;
+
+ boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+ sugov_get_util(sg_cpu, boost);
+
+ return true;
+}
+
+static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
+ unsigned int next_f;
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
- /* Limits may have changed, don't skip frequency update */
- busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
+ return;
- util = sugov_get_util(sg_cpu);
- max = sg_cpu->max;
- util = sugov_iowait_apply(sg_cpu, time, util, max);
- next_f = get_next_freq(sg_policy, util, max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (busy && next_f < sg_policy->next_freq) {
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+ !sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
- /* Reset cached freq as next_freq has changed */
- sg_policy->cached_raw_freq = 0;
+ /* Restore cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = cached_freq;
}
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ return;
+
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
raw_spin_unlock(&sg_policy->update_lock);
}
}
+static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
+
+ /*
+ * Fall back to the "frequency" path if frequency invariance is not
+ * supported, because the direct mapping between the utilization and
+ * the performance levels depends on the frequency invariance.
+ */
+ if (!arch_scale_freq_invariant()) {
+ sugov_update_single_freq(hook, time, flags);
+ return;
+ }
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
+ return;
+
+ /*
+ * Do not reduce the target performance level if the CPU has not been
+ * idle recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
+ */
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ sg_cpu->util = prev_util;
+
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
+
+ sg_cpu->sg_policy->last_freq_update_time = time;
+}
+
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
+ unsigned long boost;
- j_util = sugov_get_util(j_sg_cpu);
- j_max = j_sg_cpu->max;
- j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
+ boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ sugov_get_util(j_sg_cpu, boost);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
@@ -532,17 +486,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ goto unlock;
+
if (sg_policy->policy->fast_switch_enabled)
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
else
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
}
-
+unlock:
raw_spin_unlock(&sg_policy->update_lock);
}
@@ -554,7 +511,7 @@ static void sugov_work(struct kthread_work *work)
/*
* Hold sg_policy->update_lock shortly to handle the case where:
- * incase sg_policy->next_freq is read here, and then updated by
+ * in case sg_policy->next_freq is read here, and then updated by
* sugov_deferred_update() just before work_in_progress is set to false
* here, we may miss queueing the new update.
*
@@ -624,13 +581,46 @@ static struct attribute *sugov_attrs[] = {
};
ATTRIBUTE_GROUPS(sugov);
-static struct kobj_type sugov_tunables_ktype = {
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
+static const struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
+#ifdef CONFIG_ENERGY_MODEL
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ rebuild_sched_domains_energy();
+}
+
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+static void sugov_eas_rebuild_sd(void)
+{
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+}
+#else
+static inline void sugov_eas_rebuild_sd(void) { };
+#endif
+
struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -726,12 +716,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -787,6 +775,8 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
+ sugov_eas_rebuild_sd();
+
out:
mutex_unlock(&global_tunables_lock);
return 0;
@@ -794,7 +784,7 @@ out:
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -821,18 +811,21 @@ static void sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
+
+ sugov_eas_rebuild_sd();
}
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
+ void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
@@ -840,24 +833,24 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
- for_each_cpu(cpu, policy->cpus) {
- struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
- memset(sg_cpu, 0, sizeof(*sg_cpu));
- sg_cpu->cpu = cpu;
- sg_cpu->sg_policy = sg_policy;
- }
+ if (policy_is_shared(policy))
+ uu = sugov_update_shared;
+ else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
+ uu = sugov_update_single_perf;
+ else
+ uu = sugov_update_single_freq;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- policy_is_shared(policy) ?
- sugov_update_shared :
- sugov_update_single);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
}
return 0;
}
@@ -894,7 +887,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
- .dynamic_switching = true,
+ .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -909,41 +902,4 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
-static int __init sugov_register(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
-
-#ifdef CONFIG_ENERGY_MODEL
-extern bool sched_energy_update;
-extern struct mutex sched_energy_mutex;
-
-static void rebuild_sd_workfn(struct work_struct *work)
-{
- mutex_lock(&sched_energy_mutex);
- sched_energy_update = true;
- rebuild_sched_domains();
- sched_energy_update = false;
- mutex_unlock(&sched_energy_mutex);
-}
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
- struct cpufreq_governor *old_gov)
-{
- if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
- /*
- * When called from the cpufreq_register_driver() path, the
- * cpu_hotplug_lock is already held, so use a work item to
- * avoid nested locking in rebuild_sched_domains().
- */
- schedule_work(&rebuild_sd_work);
- }
-
-}
-#endif
+cpufreq_governor_init(schedutil_gov);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 0033731a0797..42c40cfdf836 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -11,7 +11,7 @@
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
- * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ * (INVALID), NORMAL, RT1, ... RT99, HIGHER
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
@@ -19,24 +19,47 @@
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
- * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
-#include "sched.h"
-/* Convert between a 140 based task->prio, and our 102 based cpupri */
+/*
+ * p->rt_priority p->prio newpri cpupri
+ *
+ * -1 -1 (CPUPRI_INVALID)
+ *
+ * 99 0 (CPUPRI_NORMAL)
+ *
+ * 1 98 98 1
+ * ...
+ * 49 50 50 49
+ * 50 49 49 50
+ * ...
+ * 99 0 0 99
+ *
+ * 100 100 (CPUPRI_HIGHER)
+ */
static int convert_prio(int prio)
{
int cpupri;
- if (prio == CPUPRI_INVALID)
- cpupri = CPUPRI_INVALID;
- else if (prio == MAX_PRIO)
- cpupri = CPUPRI_IDLE;
- else if (prio >= MAX_RT_PRIO)
- cpupri = CPUPRI_NORMAL;
- else
- cpupri = MAX_RT_PRIO - prio + 1;
+ switch (prio) {
+ case CPUPRI_INVALID:
+ cpupri = CPUPRI_INVALID; /* -1 */
+ break;
+
+ case 0 ... 98:
+ cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
+ break;
+
+ case MAX_RT_PRIO-1:
+ cpupri = CPUPRI_NORMAL; /* 0 */
+ break;
+
+ case MAX_RT_PRIO:
+ cpupri = CPUPRI_HIGHER; /* 100 */
+ break;
+ }
return cpupri;
}
@@ -53,7 +76,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
* When looking at the vector, we need to read the counter,
* do a memory barrier, then read the mask.
*
- * Note: This is still all racey, but we can deal with it.
+ * Note: This is still all racy, but we can deal with it.
* Ideally, we only want to look at masks that are set.
*
* If a mask is not set, then the only thing wrong is that we
@@ -73,11 +96,12 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;
- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
return 0;
if (lowest_mask) {
- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* We have to ensure that we have at least one bit
@@ -124,7 +148,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
int task_pri = convert_prio(p->prio);
int idx, cpu;
- BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+ WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
@@ -162,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* The cost of this trade-off is not entirely clear and will probably
* be good for some workloads and bad for others.
*
- * The main idea here is that if some CPUs were overcommitted, we try
+ * The main idea here is that if some CPUs were over-committed, we try
* to spread which is what the scheduler traditionally did. Sys admins
* must do proper RT planning to avoid overloading the system if they
* really care.
@@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the CPU priority setting
* @cp: The cpupri context
* @cpu: The target CPU
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index efbb492bb94c..d6cba0020064 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,11 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE 0
-#define CPUPRI_NORMAL 1
-/* values 2-101 are RT priorities 0-99 */
+#define CPUPRI_NORMAL 0
+/* values 1-99 are for RT1-RT99 priorities */
+#define CPUPRI_HIGHER 100
struct cpupri_vec {
atomic_t count;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,10 @@
/*
* Simple CPU accounting cgroup controller
*/
-#include "sched.h"
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -44,12 +47,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
}
/*
- * Called before incrementing preempt_count on {soft,}irq_enter
+ * Called after incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void irqtime_account_irq(struct task_struct *curr)
+void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ unsigned int pc;
s64 delta;
int cpu;
@@ -59,6 +63,7 @@ void irqtime_account_irq(struct task_struct *curr)
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
+ pc = irq_count() - offset;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -66,12 +71,11 @@ void irqtime_account_irq(struct task_struct *curr)
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
+ if (pc & HARDIRQ_MASK)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
static u64 irqtime_tick_accounted(u64 maxtime)
{
@@ -147,10 +151,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -226,6 +230,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
@@ -418,24 +437,21 @@ void vtime_task_switch(struct task_struct *prev)
}
# endif
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_kernel() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- if (!in_interrupt() && is_idle_task(tsk))
+ unsigned int pc = irq_count() - offset;
+
+ if (pc & HARDIRQ_OFFSET) {
+ vtime_account_hardirq(tsk);
+ } else if (pc & SOFTIRQ_OFFSET) {
+ vtime_account_softirq(tsk);
+ } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
vtime_account_idle(tsk);
- else
+ } else {
vtime_account_kernel(tsk);
+ }
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
@@ -520,50 +536,6 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -609,7 +581,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
/*
* If either stime or utime are 0, assume all runtime is userspace.
- * Once a task gets some ticks, the monotonicy code at 'update:'
+ * Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
@@ -622,7 +594,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*
@@ -661,7 +633,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
.sum_exec_runtime = p->se.sum_exec_runtime,
};
- task_cputime(p, &cputime.utime, &cputime.stime);
+ if (task_cputime(p, &cputime.utime, &cputime.stime))
+ cputime.sum_exec_runtime = task_sched_runtime(p);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
@@ -874,19 +847,21 @@ u64 task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 delta;
+ int ret;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
*stime = t->stime;
- return;
+ return false;
}
do {
+ ret = false;
seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
@@ -896,6 +871,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
if (vtime->state < VTIME_SYS)
continue;
+ ret = true;
delta = vtime_delta(vtime);
/*
@@ -907,6 +883,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return ret;
}
static int vtime_state_fetch(struct vtime *vtime, int cpu)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f63f337c7147..a04a436af8cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,13 +15,53 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
-#include "sched.h"
-#include "pelt.h"
-struct dl_bandwidth def_dl_bandwidth;
+#include <linux/cpuset.h>
+
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_dl_sysctls[] = {
+ {
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
+ },
+ {}
+};
+
+static int __init sched_dl_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_dl_sysctls);
+ return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif
+
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server;
+}
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(dl_server(dl_se));
return container_of(dl_se, struct task_struct, dl);
}
@@ -30,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq = dl_se->rq;
- return &rq->dl;
+ if (!dl_server(dl_se))
+ rq = task_rq(dl_task_of(dl_se));
+
+ return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return &rq_of_dl_se(dl_se)->dl;
}
static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -43,6 +90,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -54,15 +123,75 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
+{
+ unsigned long cap = 0;
+ int i;
+
+ for_each_cpu_and(i, mask, cpu_active_mask)
+ cap += arch_scale_cpu_capacity(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!sched_asym_cpucap_active() &&
+ arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return __dl_bw_capacity(cpu_rq(i)->rd->span);
+ }
+}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+
+ if (rd->visit_gen == gen)
+ return true;
+
+ rd->visit_gen = gen;
+ return false;
+}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,14 +202,53 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ return false;
+}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
#endif
static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -93,7 +261,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -107,7 +275,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -117,7 +285,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -157,7 +325,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
- BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
if (task_on_rq_queued(p))
return;
@@ -168,7 +336,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
@@ -180,6 +348,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
__add_rq_bw(new_bw, &rq->dl);
}
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
/*
* The utilization of a task cannot be immediately removed from
* the rq active utilization (running_bw) when the task blocks.
@@ -190,7 +360,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* fires.
*
* If the task wakes up again before the inactive timer fires,
- * the timer is cancelled, whereas if the task wakes up after the
+ * the timer is canceled, whereas if the task wakes up after the
* inactive timer fired (and running_bw has been decreased) the
* task's utilization has to be added to running_bw again.
* A flag in the deadline scheduling entity (dl_non_contending)
@@ -234,12 +404,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct task_struct *p)
+static void task_non_contending(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->inactive_timer;
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct dl_rq *dl_rq = &rq->dl;
s64 zerolag_time;
/*
@@ -269,24 +438,33 @@ static void task_non_contending(struct task_struct *p)
* utilization now, instead of starting a timer
*/
if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
- if (dl_task(p))
+ if (dl_server(dl_se)) {
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- if (p->state == TASK_DEAD)
- sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_lock(&dl_b->lock);
- __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- __dl_clear_params(p);
- raw_spin_unlock(&dl_b->lock);
+ } else {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (dl_task(p))
+ sub_running_bw(dl_se, dl_rq);
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD)
+ sub_rq_bw(dl_se, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+ }
}
return;
}
dl_se->dl_non_contending = 1;
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+
hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
@@ -308,13 +486,15 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
dl_se->dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
- put_task_struct(dl_task_of(dl_se));
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -327,31 +507,20 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
}
}
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
- return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
-}
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -363,7 +532,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
- dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
@@ -407,37 +575,17 @@ static inline void dl_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
}
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
- if (!dl_rq->overloaded) {
- dl_set_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 1;
- }
- } else if (dl_rq->overloaded) {
- dl_clear_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 0;
- }
-}
+#define __node_2_pdl(node) \
+ rb_entry((node), struct task_struct, pushable_dl_tasks)
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory++;
-
- update_dl_migration(dl_rq);
+ return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
}
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline int has_pushable_dl_tasks(struct rq *rq)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory--;
-
- update_dl_migration(dl_rq);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
/*
@@ -446,69 +594,52 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
*/
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
- struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct task_struct *entry;
- bool leftmost = true;
-
- BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct task_struct,
- pushable_dl_tasks);
- if (dl_entity_preempt(&p->dl, &entry->dl))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ struct rb_node *leftmost;
+
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+ leftmost = rb_add_cached(&p->pushable_dl_tasks,
+ &rq->dl.pushable_dl_tasks_root,
+ __pushable_less);
if (leftmost)
- dl_rq->earliest_dl.next = p->dl.deadline;
+ rq->dl.earliest_dl.next = p->dl.deadline;
- rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color_cached(&p->pushable_dl_tasks,
- &dl_rq->pushable_dl_tasks_root, leftmost);
+ if (!rq->dl.overloaded) {
+ dl_set_overload(rq);
+ rq->dl.overloaded = 1;
+ }
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
+ struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+ struct rb_node *leftmost;
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
- struct rb_node *next_node;
-
- next_node = rb_next(&p->pushable_dl_tasks);
- if (next_node) {
- dl_rq->earliest_dl.next = rb_entry(next_node,
- struct task_struct, pushable_dl_tasks)->dl.deadline;
- }
- }
+ leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+ if (leftmost)
+ dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
- rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+ if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+ dl_clear_overload(rq);
+ rq->dl.overloaded = 0;
+ }
}
static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
- return dl_task(prev);
+ return rq->online && dl_task(prev);
}
-static DEFINE_PER_CPU(struct callback_head, dl_push_head);
-static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_push_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
@@ -547,7 +678,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* Failed to find any suitable CPU.
* The task will never come back!
*/
- BUG_ON(dl_bandwidth_enabled());
+ WARN_ON_ONCE(dl_bandwidth_enabled());
/*
* If admission control is disabled we
@@ -620,15 +751,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
@@ -638,9 +760,19 @@ static inline void deadline_queue_pull_task(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
+
+static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
+ struct rq *rq)
+{
+ /* for non-boosted task, pi_of(dl_se) == dl_se */
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+}
/*
* We are being explicitly informed that a new instance is starting,
@@ -659,7 +791,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(dl_se->dl_boosted);
+ WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -675,8 +807,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
- dl_se->runtime = dl_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
/*
@@ -697,22 +828,19 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_se->dl_runtime <= 0);
+ WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
- if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
- }
+ if (dl_se->dl_deadline == 0)
+ replenish_dl_new_period(dl_se, rq);
if (dl_se->dl_yielded && dl_se->runtime > 0)
dl_se->runtime = 0;
@@ -724,8 +852,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
- dl_se->deadline += pi_se->dl_period;
- dl_se->runtime += pi_se->dl_runtime;
+ dl_se->deadline += pi_of(dl_se)->dl_period;
+ dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@@ -739,8 +867,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
if (dl_se->dl_yielded)
@@ -773,8 +900,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above.
*/
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@@ -796,9 +922,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
+ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
@@ -883,24 +1009,21 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule.
*/
-static void update_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
- !dl_se->dl_boosted)){
+ !is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
return;
}
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
}
@@ -919,15 +1042,15 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->dl_timer;
- struct rq *rq = task_rq(p);
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* We want the timer to fire at the deadline, but considering
@@ -957,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p)
* and observe our state.
*/
if (!hrtimer_is_queued(timer)) {
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
}
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+#endif
+}
+
/*
* This is the bandwidth enforcement timer callback. If here, we know
* a task is not on its dl_rq, since the fact that the timer was running
@@ -982,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p;
struct rq_flags rf;
struct rq *rq;
+ if (dl_server(dl_se)) {
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ if (dl_se->dl_throttled) {
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (dl_se->server_has_tasks(dl_se)) {
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ resched_curr(rq);
+ __push_dl_task(rq, &rf);
+ } else {
+ replenish_dl_entity(dl_se);
+ }
+
+ }
+ rq_unlock(rq, &rf);
+
+ return HRTIMER_NORESTART;
+ }
+
+ p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);
/*
@@ -999,7 +1166,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
- if (dl_se->dl_boosted)
+ if (is_dl_boosted(dl_se))
goto unlock;
/*
@@ -1027,7 +1194,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
- replenish_dl_entity(dl_se, dl_se);
+ replenish_dl_entity(dl_se);
goto unlock;
}
@@ -1037,9 +1204,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
update_rq_clock(rq);
/*
@@ -1052,25 +1219,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
-#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- if (has_pushable_dl_tasks(rq)) {
- /*
- * Nothing relies on rq->lock after this, so its safe to drop
- * rq->lock.
- */
- rq_unpin_lock(rq, &rf);
- push_dl_task(rq);
- rq_repin_lock(rq, &rf);
- }
-#endif
+ __push_dl_task(rq, &rf);
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1084,7 +1237,7 @@ unlock:
return HRTIMER_NORESTART;
}
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
@@ -1098,7 +1251,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1112,12 +1265,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
*/
static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1131,89 +1283,56 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
- int cpu = cpu_of(rq);
- u64 now;
+ s64 scaled_delta_exec;
- if (!dl_task(curr) || !on_dl_rq(dl_se))
- return;
-
- /*
- * Consumed budget is computed considering the time as
- * observed by schedulable tasks (excluding time spent
- * in hardirq context, etc.). Deadlines are instead
- * computed using hard walltime. This seems to be the more
- * natural solution, but the full ramifications of this
- * approach need further study.
- */
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
@@ -1225,10 +1344,9 @@ static void update_curr_dl(struct rq *rq)
* according to current frequency and CPU maximum capacity.
*/
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
- scaled_delta_exec = grub_reclaim(delta_exec,
- rq,
- &curr->dl);
+ scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
} else {
+ int cpu = cpu_of(rq);
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
@@ -1247,11 +1365,20 @@ throttle:
(dl_se->flags & SCHED_FLAG_DL_OVERRUN))
dl_se->dl_overrun = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
- enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+ dequeue_dl_entity(dl_se, 0);
+ if (!dl_server(dl_se)) {
+ update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+ dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+ }
+
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+ if (dl_server(dl_se))
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ else
+ enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
- if (!is_leftmost(curr, &rq->dl))
+ if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
}
@@ -1281,24 +1408,86 @@ throttle:
}
}
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+ if (!dl_server(dl_se)) {
+ dl_se->dl_server = 1;
+ setup_new_dl_entity(dl_se);
+ }
+ enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+ dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick)
+{
+ dl_se->rq = rq;
+ dl_se->server_has_tasks = has_tasks;
+ dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ s64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = update_curr_common(rq);
+ update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
{
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
inactive_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p = NULL;
struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &rf);
+ if (!dl_server(dl_se)) {
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
+ } else {
+ rq = dl_se->rq;
+ rq_lock(rq, &rf);
+ }
sched_clock_tick();
update_rq_clock(rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
+ if (dl_server(dl_se))
+ goto no_task;
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
@@ -1307,23 +1496,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ __dl_clear_params(dl_se);
goto unlock;
}
+
+no_task:
if (dl_se->dl_non_contending == 0)
goto unlock;
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
+
+ if (!dl_server(dl_se)) {
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ } else {
+ rq_unlock(rq, &rf);
+ }
return HRTIMER_NORESTART;
}
-void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
@@ -1331,6 +1527,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
timer->function = inactive_task_timer;
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1339,6 +1538,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ if (dl_rq->earliest_dl.curr == 0)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
@@ -1356,11 +1557,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->root.rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
@@ -1376,54 +1577,112 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;
- WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
-
- WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
+}
+
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+ return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
+}
+
+static inline struct sched_statistics *
+__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
+{
+ return &dl_task_of(dl_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (!schedstat_enabled())
+ return;
+
+ if ((flags & DEQUEUE_SLEEP)) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+ }
}
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_dl_entity *entry;
- int leftmost = 1;
-
- BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_dl_entity, rb_node);
- if (dl_time_before(dl_se->deadline, entry->deadline))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
- rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1436,67 +1695,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
return;
rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
}
static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, int flags)
-{
- BUG_ON(on_dl_rq(dl_se));
-
- /*
- * If this is a wakeup or a new instance, the scheduling
- * parameters of the task might need updating. Otherwise,
- * we want a replenishment of its runtime.
- */
- if (flags & ENQUEUE_WAKEUP) {
- task_contending(dl_se, flags);
- update_dl_entity(dl_se, pi_se);
- } else if (flags & ENQUEUE_REPLENISH) {
- replenish_dl_entity(dl_se, pi_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
- dl_time_before(dl_se->deadline,
- rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
- setup_new_dl_entity(dl_se);
- }
-
- __enqueue_dl_entity(dl_se);
-}
-
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- __dequeue_dl_entity(dl_se);
-}
+ WARN_ON_ONCE(on_dl_rq(dl_se));
-static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- struct sched_dl_entity *pi_se = &p->dl;
-
- /*
- * Use the scheduling parameters of the top pi-waiter task if:
- * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
- * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
- * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
- * boosted due to a SCHED_DEADLINE pi-waiter).
- * Otherwise we keep our runtime and deadline.
- */
- if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
- pi_se = &pi_task->dl;
- } else if (!dl_prio(p->normal_prio)) {
- /*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceeds its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
- */
- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
- return;
- }
+ update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
/*
* Check if a constrained deadline task was activated
@@ -1504,12 +1714,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
- dl_check_constrained_dl(&p->dl);
+ if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+ dl_check_constrained_dl(dl_se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(&p->dl, &rq->dl);
- add_running_bw(&p->dl, &rq->dl);
+ if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ add_rq_bw(dl_se, dl_rq);
+ add_running_bw(dl_se, dl_rq);
}
/*
@@ -1524,33 +1736,40 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* be counted in the active utilization; hence, we need to call
* add_running_bw().
*/
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
if (flags & ENQUEUE_WAKEUP)
- task_contending(&p->dl, flags);
+ task_contending(dl_se, flags);
return;
}
- enqueue_dl_entity(&p->dl, pi_se, flags);
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
+ update_dl_entity(dl_se);
+ } else if (flags & ENQUEUE_REPLENISH) {
+ replenish_dl_entity(dl_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ setup_new_dl_entity(dl_se);
+ }
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_dl_task(rq, p);
+ __enqueue_dl_entity(dl_se);
}
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- dequeue_dl_entity(&p->dl);
- dequeue_pushable_dl_task(rq, p);
-}
+ __dequeue_dl_entity(dl_se);
-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- update_curr_dl(rq);
- __dequeue_task_dl(rq, p, flags);
+ if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(&p->dl, &rq->dl);
- sub_rq_bw(&p->dl, &rq->dl);
+ sub_running_bw(dl_se, dl_rq);
+ sub_rq_bw(dl_se, dl_rq);
}
/*
@@ -1563,7 +1782,76 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(p);
+ task_non_contending(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (is_dl_boosted(&p->dl)) {
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+ p->dl.dl_throttled = 0;
+ }
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
+ */
+ p->dl.dl_throttled = 0;
+ if (!(flags & ENQUEUE_REPLENISH))
+ printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+ task_pid_nr(p));
+
+ return;
+ }
+
+ check_schedstat_required();
+ update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= ENQUEUE_MIGRATING;
+
+ enqueue_dl_entity(&p->dl, flags);
+
+ if (dl_server(&p->dl))
+ return;
+
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= DEQUEUE_MIGRATING;
+
+ dequeue_dl_entity(&p->dl, flags);
+ if (!p->dl.dl_throttled && !dl_server(&p->dl))
+ dequeue_pushable_dl_task(rq, p);
}
/*
@@ -1598,15 +1886,24 @@ static void yield_task_dl(struct rq *rq)
#ifdef CONFIG_SMP
+static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
+ struct rq *rq)
+{
+ return (!rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ rq->dl.earliest_dl.curr));
+}
+
static int find_later_rq(struct task_struct *task);
static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
struct task_struct *curr;
+ bool select_rq;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE)
+ if (!(flags & WF_TTWU))
goto out;
rq = cpu_rq(cpu);
@@ -1623,16 +1920,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (sched_asym_cpucap_active())
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
- (dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr) ||
- (cpu_rq(target)->dl.dl_nr_running == 0)))
+ dl_task_is_earliest_deadline(p, cpu_rq(target)))
cpu = target;
}
rcu_read_unlock();
@@ -1643,9 +1947,10 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
- if (p->state != TASK_WAKING)
+ if (READ_ONCE(p->__state) != TASK_WAKING)
return;
rq = task_rq(p);
@@ -1654,13 +1959,14 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
@@ -1669,7 +1975,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -1715,7 +2021,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
@@ -1735,19 +2041,24 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
#endif
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_end_dl(dl_rq, dl_se);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
@@ -1755,44 +2066,76 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
if (rq->curr->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
deadline_queue_push_tasks(rq);
}
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
{
struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
- return rb_entry(left, struct sched_dl_entity, rb_node);
+ return __node_2_dle(left);
}
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
+again:
if (!sched_dl_runnable(rq))
return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
- BUG_ON(!dl_se);
- p = dl_task_of(dl_se);
- set_next_task_dl(rq, p, true);
+ dl_se = pick_next_dl_entity(dl_rq);
+ WARN_ON_ONCE(!dl_se);
+
+ if (dl_server(dl_se)) {
+ p = dl_se->server_pick(dl_se);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ dl_se->dl_yielded = 1;
+ update_curr_dl_se(rq, dl_se, 0);
+ goto again;
+ }
+ p->dl_server = dl_se;
+ } else {
+ p = dl_task_of(dl_se);
+ }
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+ struct task_struct *p;
+
+ p = pick_task_dl(rq);
+ if (!p)
+ return p;
+
+ if (!p->dl_server)
+ set_next_task_dl(rq, p, true);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, &p->dl);
+
return p;
}
static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_start_dl(dl_rq, dl_se);
+
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -1818,9 +2161,9 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* not being the leftmost task anymore. In that case NEED_RESCHED will
* be set and schedule() will start a new hrtick for the next task.
*/
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
- is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(&p->dl, &rq->dl))
+ start_hrtick_dl(rq, &p->dl);
}
static void task_fork_dl(struct task_struct *p)
@@ -1838,8 +2181,8 @@ static void task_fork_dl(struct task_struct *p)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
}
@@ -1850,15 +2193,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+
next_node:
if (next_node) {
- p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(next_node);
if (pick_dl_task(rq, p, cpu))
return p;
@@ -1929,8 +2274,8 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(later_mask,
- sched_domain_span(sd));
+ best_cpu = cpumask_any_and_distribute(later_mask,
+ sched_domain_span(sd));
/*
* Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our
@@ -1952,7 +2297,7 @@ static int find_later_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(later_mask);
+ cpu = cpumask_any_distribute(later_mask);
if (cpu < nr_cpu_ids)
return cpu;
@@ -1974,9 +2319,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (later_rq->dl.dl_nr_running &&
- !dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr)) {
+ if (!dl_task_is_earliest_deadline(task, later_rq)) {
/*
* Target rq has tasks of equal or earlier deadline,
* retrying does not release any lock and is unlikely
@@ -1989,9 +2332,10 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
- task_running(rq, task) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
+ task_on_cpu(rq, task) ||
!dl_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -2004,9 +2348,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
* its earliest one has a later deadline than our
* task, the rq is a good one.
*/
- if (!later_rq->dl.dl_nr_running ||
- dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr))
+ if (dl_task_is_earliest_deadline(task, later_rq))
break;
/* Otherwise we try again. */
@@ -2024,15 +2366,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
- struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
return p;
}
@@ -2048,17 +2389,11 @@ static int push_dl_task(struct rq *rq)
struct rq *later_rq;
int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
-
next_task = pick_next_pushable_dl_task(rq);
if (!next_task)
return 0;
retry:
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
@@ -2071,6 +2406,12 @@ retry:
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -2104,13 +2445,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);
@@ -2133,7 +2468,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
@@ -2163,6 +2498,7 @@ static void pull_dl_task(struct rq *this_rq)
continue;
/* Might drop this_rq->lock */
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -2180,9 +2516,7 @@ static void pull_dl_task(struct rq *this_rq)
* - it will preempt the last one we pulled (if any).
*/
if (p && dl_time_before(p->dl.deadline, dmin) &&
- (!this_rq->dl.dl_nr_running ||
- dl_time_before(p->dl.deadline,
- this_rq->dl.earliest_dl.curr))) {
+ dl_task_is_earliest_deadline(p, this_rq)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!task_on_rq_queued(p));
@@ -2194,17 +2528,29 @@ static void pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+ resched = true;
+ }
/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
}
if (resched)
@@ -2217,7 +2563,7 @@ skip:
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
@@ -2228,12 +2574,12 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
+ struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
@@ -2243,7 +2589,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -2257,7 +2603,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- set_cpus_allowed_common(p, new_mask);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -2296,9 +2642,13 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
- rq = task_rq_lock(p, &rf);
- if (!dl_task(p))
- goto unlock;
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (!dl_task(p)) {
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return;
+ }
+
+ rq = __task_rq_lock(p, &rf);
dl_b = &rq->rd->dl_bw;
raw_spin_lock(&dl_b->lock);
@@ -2307,7 +2657,6 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock(&dl_b->lock);
-unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -2333,7 +2682,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(p);
+ task_non_contending(&p->dl);
+
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
if (!task_on_rq_queued(p)) {
/*
@@ -2375,6 +2730,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
add_rq_bw(&p->dl, &rq->dl);
@@ -2388,9 +2749,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
deadline_queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
+ } else {
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
}
}
@@ -2401,17 +2764,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || rq->curr == p) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2419,24 +2785,40 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
+}
+
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
}
+#endif
+
+DEFINE_SCHED_CLASS(dl) = {
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
- .check_preempt_curr = check_preempt_curr_dl,
+ .wakeup_preempt = wakeup_preempt_dl,
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
@@ -2444,12 +2826,14 @@ const struct sched_class dl_sched_class = {
#ifdef CONFIG_SMP
.balance = balance_dl,
+ .pick_task = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
+ .find_lock_rq = find_lock_later_rq,
#endif
.task_tick = task_tick_dl,
@@ -2460,35 +2844,44 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
+/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
+static u64 dl_generation;
+
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
- int cpu, ret = 0;
+ int cpu, cpus, ret = 0;
unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
* value smaller than the currently allocated bandwidth in
* any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
*/
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen))
+ goto next;
+
dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
+ if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+next:
rcu_read_unlock_sched();
if (ret)
@@ -2502,33 +2895,34 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
void sched_dl_do_global(void)
{
u64 new_bw = -1;
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
- /*
- * FIXME: As above...
- */
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen)) {
+ rcu_read_unlock_sched();
+ continue;
+ }
+
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
@@ -2551,11 +2945,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +2965,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2618,7 +3015,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2631,7 +3028,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2646,6 +3044,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,22 +3069,29 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
/*
* This function clears the sched_dl_entity static params.
*/
-void __dl_clear_params(struct task_struct *p)
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
@@ -2692,11 +3099,23 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
- dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+ dl_se->dl_server = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+ dl_se->pi_se = dl_se;
+#endif
+}
+
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+ RB_CLEAR_NODE(&dl_se->rb_node);
+ init_dl_task_timer(dl_se);
+ init_dl_inactive_task_timer(dl_se);
+ __dl_clear_params(dl_se);
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2706,60 +3125,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
}
#ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
- unsigned int dest_cpu;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus, ret;
- unsigned long flags;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow) {
- ret = -EBUSY;
- } else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, cpus);
- ret = 0;
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- return ret;
-}
-
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
+ unsigned long flags, cap;
struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
+ cap = __dl_bw_capacity(trial);
raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ if (__dl_overflow(cur_dl_b, cap, 0, 0))
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
@@ -2767,22 +3151,59 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-bool dl_cpu_busy(unsigned int cpu)
+enum dl_bw_request {
+ dl_bw_req_check_overflow = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
unsigned long flags;
struct dl_bw *dl_b;
- bool overflow;
- int cpus;
+ bool overflow = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+
+ if (req == dl_bw_req_free) {
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ } else {
+ unsigned long cap = dl_bw_capacity(cpu);
+
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (req == dl_bw_req_alloc && !overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
+ }
+
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- return overflow;
+ return overflow ? -EBUSY : 0;
+}
+
+int dl_bw_check_overflow(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..8d5d98a5834d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,12 +6,9 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
-#include "sched.h"
-
-static DEFINE_SPINLOCK(sched_debug_lock);
/*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
* to the console
*/
#define SEQ_printf(m, x...) \
@@ -169,191 +166,317 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-__read_mostly bool sched_debug_enabled;
+#ifdef CONFIG_SMP
-static __init int sched_init_debug(void)
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
+ char buf[16];
+ unsigned int scaling;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = '\0';
+
+ if (kstrtouint(buf, 10, &scaling))
+ return -EINVAL;
+
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
- debugfs_create_bool("sched_debug", 0644, NULL,
- &sched_debug_enabled);
+ sysctl_sched_tunable_scaling = scaling;
+ if (sched_update_scaling())
+ return -EINVAL;
+ *ppos += cnt;
+ return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
return 0;
}
-late_initcall(sched_init_debug);
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+ .open = sched_scaling_open,
+ .write = sched_scaling_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* SMP */
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+ static const char * preempt_modes[] = {
+ "none", "voluntary", "full"
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+__read_mostly bool sched_debug_verbose;
#ifdef CONFIG_SMP
+static struct dentry *sd_dentry;
-#ifdef CONFIG_SYSCTL
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ cpus_read_lock();
+ mutex_lock(&sched_domains_mutex);
+
+ orig = sched_debug_verbose;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sched_debug_verbose && !orig)
+ update_sched_domain_debugfs();
+ else if (!sched_debug_verbose && orig) {
+ debugfs_remove(sd_dentry);
+ sd_dentry = NULL;
+ }
+
+ mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
+
+ return result;
+}
+#else
+#define sched_verbose_write debugfs_write_file_bool
+#endif
+
+static const struct file_operations sched_verbose_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_verbose_write,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
+static const struct seq_operations sched_debug_sops;
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &sched_debug_sops);
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
-static struct ctl_table *sd_alloc_ctl_entry(int n)
+static struct dentry *debugfs_sched;
+
+static __init int sched_init_debug(void)
{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+ struct dentry __maybe_unused *numa;
+
+ debugfs_sched = debugfs_create_dir("sched", NULL);
+
+ debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+ debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+ debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
+#endif
+
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
+
+ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
+#ifdef CONFIG_SMP
+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- return entry;
+ mutex_lock(&sched_domains_mutex);
+ update_sched_domain_debugfs();
+ mutex_unlock(&sched_domains_mutex);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ numa = debugfs_create_dir("numa_balancing", debugfs_sched);
+
+ debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+ debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+ debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+ debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
+#endif
+
+ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
+
+ return 0;
}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sd_sysctl_cpus;
-static void sd_free_ctl_entry(struct ctl_table **tablep)
+static int sd_flags_show(struct seq_file *m, void *v)
{
- struct ctl_table *entry;
+ unsigned long flags = *(unsigned int *)m->private;
+ int idx;
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ seq_puts(m, sd_flag_debug[idx].name);
+ seq_puts(m, " ");
}
+ seq_puts(m, "\n");
- kfree(*tablep);
- *tablep = NULL;
+ return 0;
}
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
-{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
- struct ctl_table *table = sd_alloc_ctl_entry(9);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
- set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[8] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
+static int sd_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sd_flags_show, inode->i_private);
}
-static cpumask_var_t sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
+static const struct file_operations sd_flags_fops = {
+ .open = sd_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
-void register_sched_domain_sysctl(void)
+static void register_sd(struct sched_domain *sd, struct dentry *parent)
{
- static struct ctl_table *cpu_entries;
- static struct ctl_table **cpu_idx;
- static bool init_done = false;
- char buf[32];
- int i;
+#define SDM(type, mode, member) \
+ debugfs_create_##type(#member, mode, parent, &sd->member)
- if (!cpu_entries) {
- cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
- if (!cpu_entries)
- return;
+ SDM(ulong, 0644, min_interval);
+ SDM(ulong, 0644, max_interval);
+ SDM(u64, 0644, max_newidle_lb_cost);
+ SDM(u32, 0644, busy_factor);
+ SDM(u32, 0644, imbalance_pct);
+ SDM(u32, 0644, cache_nice_tries);
+ SDM(str, 0444, name);
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = cpu_entries;
- }
+#undef SDM
- if (!cpu_idx) {
- struct ctl_table *e = cpu_entries;
+ debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+}
- cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
- if (!cpu_idx)
- return;
+void update_sched_domain_debugfs(void)
+{
+ int cpu, i;
- /* deal with sparse possible map */
- for_each_possible_cpu(i) {
- cpu_idx[i] = e;
- e++;
- }
- }
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
+ if (!sched_debug_verbose)
+ return;
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
- if (!init_done) {
- init_done = true;
- /* init to possible to not have holes in @cpu_entries */
- cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ if (!sd_dentry) {
+ sd_dentry = debugfs_create_dir("domains", debugfs_sched);
+
+ /* rebuild sd_sysctl_cpus if empty since it gets cleared below */
+ if (cpumask_empty(sd_sysctl_cpus))
+ cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
}
- for_each_cpu(i, sd_sysctl_cpus) {
- struct ctl_table *e = cpu_idx[i];
+ for_each_cpu(cpu, sd_sysctl_cpus) {
+ struct sched_domain *sd;
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%d", cpu);
+ debugfs_lookup_and_remove(buf, sd_dentry);
+ d_cpu = debugfs_create_dir(buf, sd_dentry);
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ struct dentry *d_sd;
- if (e->child)
- sd_free_ctl_entry(&e->child);
+ snprintf(buf, sizeof(buf), "domain%d", i);
+ d_sd = debugfs_create_dir(buf, d_cpu);
- if (!e->procname) {
- snprintf(buf, 32, "cpu%d", i);
- e->procname = kstrdup(buf, GFP_KERNEL);
+ register_sd(sd, d_sd);
+ i++;
}
- e->mode = 0555;
- e->child = sd_alloc_ctl_cpu_table(i);
- __cpumask_clear_cpu(i, sd_sysctl_cpus);
+ __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
}
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
void dirty_sched_domain_sysctl(int cpu)
@@ -362,13 +485,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-/* may be called multiple times per register */
-void unregister_sched_domain_sysctl(void)
-{
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
-}
-#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -377,9 +493,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
struct sched_entity *se = tg->se[cpu];
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
+ #F, (long long)schedstat_val(stats->F))
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
+ #F, SPLIT_NS((long long)schedstat_val(stats->F)))
if (!se)
return;
@@ -389,16 +507,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
- PN_SCHEDSTAT(se->statistics.wait_start);
- PN_SCHEDSTAT(se->statistics.sleep_start);
- PN_SCHEDSTAT(se->statistics.block_start);
- PN_SCHEDSTAT(se->statistics.sleep_max);
- PN_SCHEDSTAT(se->statistics.block_max);
- PN_SCHEDSTAT(se->statistics.exec_max);
- PN_SCHEDSTAT(se->statistics.slice_max);
- PN_SCHEDSTAT(se->statistics.wait_max);
- PN_SCHEDSTAT(se->statistics.wait_sum);
- P_SCHEDSTAT(se->statistics.wait_count);
+ struct sched_statistics *stats;
+ stats = __schedstats_from_se(se);
+
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
}
P(se->load.weight);
@@ -416,43 +537,69 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];
-static char *task_group_path(struct task_group *tg)
+static void task_group_path(struct task_group *tg, char *path, int plen)
{
- if (autogroup_path(tg, group_path, PATH_MAX))
- return group_path;
+ if (autogroup_path(tg, path, plen))
+ return;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, path, plen);
+}
- return group_path;
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
}
#endif
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
- if (rq->curr == p)
+ if (task_current(rq, p))
SEQ_printf(m, ">R");
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -481,15 +628,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
@@ -497,30 +643,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ raw_spin_rq_lock_irqsave(rq, flags);
+ root = __pick_root_entity(cfs_rq);
+ if (root)
+ left_vruntime = root->min_vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_deadline = first->deadline;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
+ SPLIT_NS(left_deadline));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
+ cfs_rq->idle_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+ cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
@@ -529,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
- cfs_rq->avg.util_est.enqueued);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est",
+ cfs_rq->avg.util_est);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -560,7 +715,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
@@ -574,9 +729,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
@@ -598,7 +750,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
PU(dl_nr_running);
#ifdef CONFIG_SMP
- PU(dl_nr_migratory);
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
@@ -612,7 +763,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
#ifdef CONFIG_X86
{
@@ -628,7 +778,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
@@ -663,13 +813,11 @@ do { \
}
#undef P
- spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
@@ -716,10 +864,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
- P(sysctl_sched_child_runs_first);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_features);
#undef PN
#undef P
@@ -761,7 +906,7 @@ void sysrq_sched_debug_show(void)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is CPU 0.
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
@@ -806,18 +951,10 @@ static const struct seq_operations sched_debug_sops = {
.show = sched_debug_show,
};
-static int __init init_sched_debug_procfs(void)
-{
- if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
- return -ENOMEM;
- return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F) __PSN(#F, F)
#define PN(F) __PSN(#F, p->F)
@@ -837,25 +974,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
@@ -870,8 +997,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"---------------------------------------------------------"
"----------\n");
-#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
-#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
PN(se.exec_start);
PN(se.vruntime);
@@ -884,33 +1011,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- PN_SCHEDSTAT(se.statistics.wait_start);
- PN_SCHEDSTAT(se.statistics.sleep_start);
- PN_SCHEDSTAT(se.statistics.block_start);
- PN_SCHEDSTAT(se.statistics.sleep_max);
- PN_SCHEDSTAT(se.statistics.block_max);
- PN_SCHEDSTAT(se.statistics.exec_max);
- PN_SCHEDSTAT(se.statistics.slice_max);
- PN_SCHEDSTAT(se.statistics.wait_max);
- PN_SCHEDSTAT(se.statistics.wait_sum);
- P_SCHEDSTAT(se.statistics.wait_count);
- PN_SCHEDSTAT(se.statistics.iowait_sum);
- P_SCHEDSTAT(se.statistics.iowait_count);
- P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- P_SCHEDSTAT(se.statistics.nr_wakeups);
- P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(sum_sleep_runtime);
+ PN_SCHEDSTAT(sum_block_runtime);
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ PN_SCHEDSTAT(iowait_sum);
+ P_SCHEDSTAT(iowait_count);
+ P_SCHEDSTAT(nr_migrations_cold);
+ P_SCHEDSTAT(nr_failed_migrations_affine);
+ P_SCHEDSTAT(nr_failed_migrations_running);
+ P_SCHEDSTAT(nr_failed_migrations_hot);
+ P_SCHEDSTAT(nr_forced_migrations);
+ P_SCHEDSTAT(nr_wakeups);
+ P_SCHEDSTAT(nr_wakeups_sync);
+ P_SCHEDSTAT(nr_wakeups_migrate);
+ P_SCHEDSTAT(nr_wakeups_local);
+ P_SCHEDSTAT(nr_wakeups_remote);
+ P_SCHEDSTAT(nr_wakeups_affine);
+ P_SCHEDSTAT(nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(nr_wakeups_passive);
+ P_SCHEDSTAT(nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -928,6 +1056,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}
__P(nr_switches);
@@ -943,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
- P(se.avg.util_est.ewma);
- P(se.avg.util_est.enqueued);
+ PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
@@ -976,6 +1107,16 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
}
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+ static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+ WARN(__ratelimit(&latency_check_ratelimit),
+ "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+ "without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..533547e3c90a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,40 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include "sched.h"
-
-#include <trace/events/sched.h>
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
+
+#include <asm/switch_to.h>
-/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* The initial- and re-scaling of tunables is configurable
@@ -51,38 +66,15 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
-
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -115,6 +107,13 @@ int __weak arch_asym_cpu_priority(int cpu)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+/*
+ * The margin used when comparing CPU capacities.
+ * is 'cap1' noticeably greater than 'cap2'
+ *
+ * (default: ~5%)
+ */
+#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -128,7 +127,45 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_fair_sysctls[] = {
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+ {}
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_fair_sysctls);
+ return 0;
+}
+late_initcall(sched_fair_sysctl_init);
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -185,9 +222,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -231,27 +266,40 @@ static void __update_inv_weight(struct load_weight *lw)
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
+ int fs;
__update_inv_weight(lw);
- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -260,46 +308,11 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -408,7 +421,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -445,40 +458,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-
-static inline struct task_struct *task_of(struct sched_entity *se)
+static int tg_is_idle(struct task_group *tg)
{
- return container_of(se, struct task_struct, se);
+ return tg->idle > 0;
}
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
- return &task_rq(p)->cfs;
+ return cfs_rq->idle > 0;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static int se_is_idle(struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
}
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
+#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
@@ -506,6 +506,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
+static inline int tg_is_idle(struct task_group *tg)
+{
+ return 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ return 0;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -533,17 +548,221 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline int entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ /*
+ * Tiebreak on vruntime seems unnecessary since it can
+ * hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
}
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
+#define __node_2_se(node) \
+ rb_entry((node), struct sched_entity, run_node)
+
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is it's virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-continguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplcation could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+/*
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load) {
+ /* sign flips effective floor / ceil */
+ if (avg < 0)
+ avg -= (load - 1);
+ avg = div_s64(avg, load);
+ }
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 lag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * to the loss in precision caused by the division.
+ */
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return vruntime_eligible(cfs_rq, se->vruntime);
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
u64 vruntime = cfs_rq->min_vruntime;
if (curr) {
@@ -553,60 +772,78 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se;
- se = rb_entry(leftmost, struct sched_entity, run_node);
-
+ if (se) {
if (!curr)
- vruntime = se->vruntime;
+ vruntime = se->min_vruntime;
else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ vruntime = min_vruntime(vruntime, se->min_vruntime);
}
/* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime,
+ __update_min_vruntime(cfs_rq, vruntime));
+}
+
+static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+{
+ return entity_before(__node_2_se(a), __node_2_se(b));
+}
+
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (vruntime_gt(min_vruntime, se, rse))
+ se->min_vruntime = rse->min_vruntime;
+ }
}
/*
- * Enqueue an entity into the rb-tree:
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- bool leftmost = true;
+ u64 old_min_vruntime = se->min_vruntime;
+ struct rb_node *node = &se->run_node;
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ se->min_vruntime = se->vruntime;
+ __min_vruntime_update(se, node->rb_right);
+ __min_vruntime_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime;
+}
+
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+ run_node, min_vruntime, min_vruntime_update);
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color_cached(&se->run_node,
- &cfs_rq->tasks_timeline, leftmost);
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ avg_vruntime_add(cfs_rq, se);
+ se->min_vruntime = se->vruntime;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_vruntime_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_vruntime_cb);
+ avg_vruntime_sub(cfs_rq, se);
+}
+
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+ if (!root)
+ return NULL;
+
+ return __node_2_se(root);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -616,17 +853,91 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
if (!left)
return NULL;
- return rb_entry(left, struct sched_entity, run_node);
+ return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
+ *
+ * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
+ *
+ * Which allows tree pruning through eligibility.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ /*
+ * We can safely skip eligibility check if there is only one entity
+ * in this cfs_rq, saving some cycles.
+ */
+ if (cfs_rq->nr_running == 1)
+ return curr && curr->on_rq ? curr : se;
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
+ /* Pick the leftmost entity if it's eligible */
+ if (se && entity_eligible(cfs_rq, se)) {
+ best = se;
+ goto found;
+ }
+
+ /* Heap search for the EEVD entity */
+ while (node) {
+ struct rb_node *left = node->rb_left;
+
+ /*
+ * Eligible entities in left subtree are always better
+ * choices, since they have earlier deadlines.
+ */
+ if (left && vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime)) {
+ node = left;
+ continue;
+ }
+
+ se = __node_2_se(node);
+
+ /*
+ * The left subtree either is empty or has no eligible
+ * entity, so check the current node since it is the one
+ * with earliest deadline that might be eligible.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ best = se;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+found:
+ if (!best || (curr && entity_before(curr, best)))
+ best = curr;
- return rb_entry(next, struct sched_entity, run_node);
+ return best;
}
#ifdef CONFIG_SCHED_DEBUG
@@ -637,99 +948,57 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
if (!last)
return NULL;
- return rb_entry(last, struct sched_entity, run_node);
+ return __node_2_se(last);
}
/**************************************************************
* Scheduling class statistics methods:
*/
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SMP
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
-
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
+#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ se->slice = sysctl_sched_base_slice;
- if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
}
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
@@ -758,8 +1027,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static void attach_entity_cfs_rq(struct sched_entity *se);
-
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -794,20 +1061,6 @@ void post_init_entity_util_avg(struct task_struct *p)
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
- }
-
- sa->runnable_avg = sa->util_avg;
-
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -823,7 +1076,19 @@ void post_init_entity_util_avg(struct task_struct *p)
return;
}
- attach_entity_cfs_rq(se);
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ sa->runnable_avg = sa->util_avg;
}
#else /* !CONFIG_SMP */
@@ -833,45 +1098,79 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+{
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - curr->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
+
+ curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
+
+ if (schedstat_enabled()) {
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(curr);
+ __schedstat_set(stats->exec_max,
+ max(delta_exec, stats->exec_max));
+ }
+
+ return delta_exec;
+}
+
+static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+{
+ trace_sched_stat_runtime(p, delta_exec);
+ account_group_exec_runtime(p, delta_exec);
+ cgroup_account_cputime(p, delta_exec);
+ if (p->dl_server)
+ dl_server_update(p->dl_server, delta_exec);
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_se(rq, &curr->se);
+ if (likely(delta_exec > 0))
+ update_curr_task(curr, delta_exec);
+
+ return delta_exec;
+}
+
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
+ s64 delta_exec;
if (unlikely(!curr))
return;
- delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ if (unlikely(delta_exec <= 0))
return;
- curr->exec_start = now;
-
- schedstat_set(curr->statistics.exec_max,
- max(delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
-
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
- if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
-
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cgroup_account_cputime(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
- }
+ if (entity_is_task(curr))
+ update_curr_task(task_of(curr), delta_exec);
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
@@ -882,128 +1181,70 @@ static void update_curr_fair(struct rq *rq)
}
static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start, prev_wait_start;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
- wait_start = rq_clock(rq_of(cfs_rq));
- prev_wait_start = schedstat_val(se->statistics.wait_start);
+ stats = __schedstats_from_se(se);
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > prev_wait_start))
- wait_start -= prev_wait_start;
+ if (entity_is_task(se))
+ p = task_of(se);
- __schedstat_set(se->statistics.wait_start, wait_start);
+ __update_stats_wait_start(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct task_struct *p;
- u64 delta;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
- delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
+ stats = __schedstats_from_se(se);
- if (entity_is_task(se)) {
+ /*
+ * When the sched_schedstat changes from 0 to 1, some sched se
+ * maybe already in the runqueue, the se->statistics.wait_start
+ * will be 0.So it will let the delta wrong. We need to avoid this
+ * scenario.
+ */
+ if (unlikely(!schedstat_val(stats->wait_start)))
+ return;
+
+ if (entity_is_task(se))
p = task_of(se);
- if (task_on_rq_migrating(p)) {
- /*
- * Preserve migrating task's wait time so wait_start
- * time stamp can be adjusted to accumulate wait time
- * prior to migration.
- */
- __schedstat_set(se->statistics.wait_start, delta);
- return;
- }
- trace_sched_stat_wait(p, delta);
- }
- __schedstat_set(se->statistics.wait_max,
- max(schedstat_val(se->statistics.wait_max), delta));
- __schedstat_inc(se->statistics.wait_count);
- __schedstat_add(se->statistics.wait_sum, delta);
- __schedstat_set(se->statistics.wait_start, 0);
+ __update_stats_wait_end(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct sched_statistics *stats;
struct task_struct *tsk = NULL;
- u64 sleep_start, block_start;
if (!schedstat_enabled())
return;
- sleep_start = schedstat_val(se->statistics.sleep_start);
- block_start = schedstat_val(se->statistics.block_start);
+ stats = __schedstats_from_se(se);
if (entity_is_task(se))
tsk = task_of(se);
- if (sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- __schedstat_set(se->statistics.sleep_max, delta);
-
- __schedstat_set(se->statistics.sleep_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- __schedstat_set(se->statistics.block_max, delta);
-
- __schedstat_set(se->statistics.block_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- if (tsk->in_iowait) {
- __schedstat_add(se->statistics.iowait_sum, delta);
- __schedstat_inc(se->statistics.iowait_count);
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
+ __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
@@ -1013,14 +1254,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
+ update_stats_wait_start_fair(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
- update_stats_enqueue_sleeper(cfs_rq, se);
+ update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
@@ -1031,16 +1272,19 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* waiting task:
*/
if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
+ unsigned int state;
- if (tsk->state & TASK_INTERRUPTIBLE)
- __schedstat_set(se->statistics.sleep_start,
+ /* XXX racy against TTWU */
+ state = READ_ONCE(tsk->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq)));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- __schedstat_set(se->statistics.block_start,
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -1061,6 +1305,50 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1076,6 +1364,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
struct numa_group {
refcount_t refcount;
@@ -1088,11 +1379,12 @@ struct numa_group {
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
+ * faults[] array is split into two regions: faults_mem and faults_cpu.
+ *
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
- unsigned long *faults_cpu;
unsigned long faults[];
};
@@ -1103,7 +1395,7 @@ struct numa_group {
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
- (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -1133,7 +1425,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
return rss / nr_scan_pages;
}
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
@@ -1266,8 +1558,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
- group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+ return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
+ group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
}
static inline unsigned long group_faults_priv(struct numa_group *ng)
@@ -1308,10 +1600,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1320,6 +1612,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1332,7 +1626,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1342,8 +1636,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1361,8 +1654,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1416,15 +1709,164 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct folio *folio)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = folio_xchg_access_time(folio, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(folio);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ folio_nr_pages(folio));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
+
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
/*
* Allow first faults or private faults to migrate immediately early in
@@ -1506,6 +1948,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
+ unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
@@ -1515,28 +1958,12 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(cpu))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1549,19 +1976,20 @@ struct task_numa_env {
};
static unsigned long cpu_load(struct rq *rq);
-static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static unsigned long cpu_runnable(struct rq *rq);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
- ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
- ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
@@ -1569,11 +1997,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
-static inline bool test_idle_cores(int cpu, bool def);
+static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
- idle_core >= 0 || !test_idle_cores(cpu, false))
+ idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
/*
@@ -1612,11 +2040,12 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
- ns->util += cpu_util(cpu);
+ ns->runnable += cpu_runnable(rq);
+ ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1648,7 +2077,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -1788,6 +2217,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -1927,7 +2365,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running,
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1992,8 +2431,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -2028,7 +2469,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2116,7 +2557,7 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find out how many nodes on the workload is actively running on. Do this by
+ * Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
@@ -2126,13 +2567,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2170,7 +2611,7 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
- * completely idle or all activity is areas that are not of interest
+ * completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Related to that, if there were failed
* migration then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
@@ -2286,7 +2727,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2305,7 +2746,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2427,7 +2868,7 @@ static void task_numa_placement(struct task_struct *p)
* is at the beginning of the numa_faults array.
*/
ng->faults[mem_idx] += diff;
- ng->faults_cpu[mem_idx] += f_diff;
+ ng->faults[cpu_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
@@ -2444,6 +2885,9 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ max_nid = numa_nearest_node(max_nid, N_CPU);
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2481,7 +2925,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
- 4*nr_node_ids*sizeof(unsigned long);
+ NR_NUMA_HINT_FAULT_STATS *
+ nr_node_ids * sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
@@ -2492,9 +2937,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
- /* Second half of the array tracks nids where faults happen */
- grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
- nr_node_ids;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2551,7 +2993,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -2578,7 +3020,7 @@ no_join:
}
/*
- * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
@@ -2636,6 +3078,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2706,6 +3157,36 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long pids;
+ /*
+ * Allow unconditional access first two times, so that all the (pages)
+ * of VMAs get prot_none fault introduced irrespective of accesses.
+ * This is also done to avoid any side effect of task scanning
+ * amplifying the unfairness of disjoint set of VMAs' access.
+ */
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+ if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ return true;
+
+ /*
+ * Complete a scan that has already started regardless of PID access, or
+ * some VMAs may never be scanned in multi-threaded applications:
+ */
+ if (mm->numa_scan_offset > vma->vm_start) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+ return true;
+ }
+
+ return false;
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -2720,6 +3201,9 @@ static void task_numa_work(struct callback_head *work)
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
+ struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -2753,7 +3237,7 @@ static void task_numa_work(struct callback_head *work)
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
return;
/*
@@ -2762,7 +3246,6 @@ static void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
@@ -2772,15 +3255,29 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
}
- for (; vma; vma = vma->vm_next) {
+
+ do {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
continue;
}
@@ -2791,15 +3288,79 @@ static void task_numa_work(struct callback_head *work)
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
continue;
+ }
/*
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
- if (!vma_is_accessible(vma))
+ if (!vma_is_accessible(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
+ continue;
+ }
+
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+ GFP_KERNEL);
+ if (!vma->numab_state)
+ continue;
+
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* Reset happens after 4 times scan delay of scan start */
+ vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
+ }
+
+ /*
+ * Scanning the VMA's of short lived tasks add more overhead. So
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
continue;
+ }
+
+ /* RESET access PIDs regularly for old VMAs. */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->pids_active_reset)) {
+ vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+ vma->numab_state->pids_active[1] = 0;
+ }
+
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+ continue;
+ }
+
+ /*
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
+ */
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+ continue;
+ }
do {
start = max(start, vma->vm_start);
@@ -2825,6 +3386,26 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
+ } for_each_vma(vmi, vma);
+
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
}
out:
@@ -2867,9 +3448,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -2907,7 +3491,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -2925,7 +3509,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan))
- task_work_add(curr, work, true);
+ task_work_add(curr, work, TWA_RESUME);
}
}
@@ -2995,6 +3579,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
cfs_rq->nr_running++;
+ if (se_is_idle(se))
+ cfs_rq->idle_nr_running++;
}
static void
@@ -3008,6 +3594,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
cfs_rq->nr_running--;
+ if (se_is_idle(se))
+ cfs_rq->idle_nr_running--;
}
/*
@@ -3071,6 +3659,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3079,31 +3670,163 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ unsigned long old_weight = se->load.weight;
+ u64 avruntime = avg_vruntime(cfs_rq);
+ s64 vlag, vslice;
+
+ /*
+ * VRUNTIME
+ * ========
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ if (avruntime != se->vruntime) {
+ vlag = (s64)(avruntime - se->vruntime);
+ vlag = div_s64(vlag * old_weight, weight);
+ se->vruntime = avruntime - vlag;
+ }
+
+ /*
+ * DEADLINE
+ * ========
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ vslice = (s64)(se->deadline - avruntime);
+ vslice = div_s64(vslice * old_weight, weight);
+ se->deadline = avruntime + vslice;
+}
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ bool curr = cfs_rq->curr == se;
+
if (se->on_rq) {
/* commit outstanding execution time */
- if (cfs_rq->curr == se)
+ if (curr)
update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
+ else
+ __dequeue_entity(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
+ if (!se->on_rq) {
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ } else {
+ reweight_eevdf(cfs_rq, se, weight);
+ }
+
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ if (se->on_rq) {
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+ /*
+ * The entity's vruntime has been adjusted, so let's check
+ * whether the rq-wide min_vruntime needs updated too. Since
+ * the calculations above require stable min_vruntime rather
+ * than up-to-date one, we do the update at the end of the
+ * reweight process.
+ */
+ update_min_vruntime(cfs_rq);
+ }
}
void reweight_task(struct task_struct *p, int prio)
@@ -3117,6 +3840,8 @@ void reweight_task(struct task_struct *p, int prio)
load->inv_weight = sched_prio_to_wmult[prio];
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3128,7 +3853,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
- * \Sum grq->load.weight
+ * \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -3142,7 +3867,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
- * tg->load_avg
+ * tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
@@ -3158,7 +3883,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
- * grp->load.weight
+ * grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -3177,7 +3902,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
- * tg_load_avg'
+ * tg_load_avg'
*
* Where:
*
@@ -3227,8 +3952,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -3246,14 +3969,11 @@ static void update_cfs_group(struct sched_entity *se)
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
-
- if (likely(se->load.weight == shares))
- return;
#else
- shares = calc_group_shares(gcfs_rq);
+ shares = calc_group_shares(gcfs_rq);
#endif
-
- reweight_entity(cfs_rq_of(se), se, shares);
+ if (unlikely(se->load.weight != shares))
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3279,18 +3999,85 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ * If cfs_rq is not on the list, test whether a child needs its to be added to
+ * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
+ */
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+ struct cfs_rq *prev_cfs_rq;
+ struct list_head *prev;
+
+ if (cfs_rq->on_list) {
+ prev = cfs_rq->leaf_cfs_rq_list.prev;
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+
+ prev = rq->tmp_alone_branch;
+ }
+
+ prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+ return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (!load_avg_is_decayed(&cfs_rq->avg))
+ return false;
+
+ if (child_cfs_rq_on_list(cfs_rq))
+ return false;
+
+ return true;
+}
+
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@@ -3302,9 +4089,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ long delta;
+ u64 now;
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3312,12 +4100,69 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
+ /*
+ * For migration heavy workloads, access to tg->load_avg can be
+ * unbound. Limit the update rate to at most once per ms.
+ */
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
}
}
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
@@ -3342,32 +4187,13 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
-
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3435,51 +4261,66 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
+
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
- /* Nothing to update */
- if (!delta)
- return;
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
+
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
-
- /* Nothing to update */
- if (!delta)
- return;
+ divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
@@ -3500,7 +4341,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3515,7 +4356,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3532,16 +4373,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
+ return;
+
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta_avg);
add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3610,7 +4457,9 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3621,18 +4470,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ /*
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
+ *
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
+ *
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
+ *
+ * We can then write:
+ *
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
+ * is observed the old clock_pelt_idle value and the new clock_idle,
+ * which lead to an underestimation. The opposite would lead to an
+ * overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
+
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- * avg. The immediate corollary is that all (fair) tasks must be attached, see
- * post_init_entity_util_avg().
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3646,7 +4577,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3658,14 +4589,31 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3678,12 +4626,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
-
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
return decayed;
}
@@ -3701,7 +4646,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3723,11 +4668,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
se->avg.runnable_sum = se->avg.runnable_avg * divider;
- se->avg.load_sum = divider;
- if (se_weight(se)) {
- se->avg.load_sum =
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- }
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ else
+ se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -3755,8 +4700,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3771,6 +4723,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -3798,37 +4751,23 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -3853,8 +4792,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
- * post_init_entity_util_avg() which will have added things to the
- * cfs_rq, so we can remove unconditionally.
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
sync_entity_load_avg(se);
@@ -3884,31 +4823,20 @@ static inline unsigned long task_util(struct task_struct *p)
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long _task_util_est(struct task_struct *p)
+static inline unsigned long task_runnable(struct task_struct *p)
{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return READ_ONCE(p->se.avg.runnable_avg);
}
-static inline unsigned long task_util_est(struct task_struct *p)
+static inline unsigned long _task_util_est(struct task_struct *p)
{
- return max(task_util(p), _task_util_est(p));
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
}
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
-{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long task_util_est(struct task_struct *p)
{
- return task_util_est(p);
+ return max(task_util(p), _task_util_est(p));
}
-#endif
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
@@ -3919,38 +4847,39 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued += _task_util_est(p);
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
-}
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + maring < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
- return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
{
- long last_ewma_diff;
- struct util_est ue;
- int cpu;
+ unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ enqueued = cfs_rq->avg.util_est;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
+{
+ unsigned int ewma, dequeued, last_ewma_diff;
+
+ if (!sched_feat(UTIL_EST))
+ return;
/*
* Skip update of task's estimated utilization when the task has not
@@ -3959,82 +4888,219 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (!task_sleep)
return;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
+
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
- ue = p->se.avg.util_est;
- if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ if (ewma & UTIL_AVG_UNCHANGED)
return;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
+
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
- if (sched_feat(UTIL_EST_FASTUP)) {
- if (ue.ewma < ue.enqueued) {
- ue.ewma = ue.enqueued;
- goto done;
- }
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
}
/*
- * Skip update of task's estimated utilization when its EWMA is
+ * Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
- last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
- return;
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
/*
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
+ * To avoid underestimate of task utilization, skip updates of EWMA if
+ * we cannot grant that thread got all CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+ goto done;
+
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
- * of the task size. This is done by storing the current PELT value
- * as ue.enqueued and by using this value to update the Exponential
- * Weighted Moving Average (EWMA):
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( last_ewma_diff ) + ewma(t-1)
- * = w * (last_ewma_diff + ewma(t-1) / w)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
- ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ue.ewma += last_ewma_diff;
- ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
- WRITE_ONCE(p->se.avg.util_est, ue);
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
+
+ trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long capacity_orig, capacity_orig_thermal;
+ unsigned long capacity = capacity_of(cpu);
+ bool fits, uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for thermal pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ */
+ capacity_orig = arch_scale_cpu_capacity(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
+ * to cpu1
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return;
- if (!p) {
+ if (!p || p->nr_cpus_allowed == 1) {
rq->misfit_task_load = 0;
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -4048,9 +5114,15 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
#else /* CONFIG_SMP */
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ return !cfs_rq->nr_running;
+}
+
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define DO_DETACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -4073,178 +5145,186 @@ static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vruntime = cfs_rq->min_vruntime;
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
+
+ se->slice = sysctl_sched_base_slice;
+ vslice = calc_delta_fair(se->slice, se);
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
+ lag = se->vlag;
/*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
*/
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- vruntime -= thresh;
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
-}
+ se->vruntime = vruntime - lag;
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-
-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled())
- return;
+ /*
+ * When joining the competition; the exisiting tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- /* Force schedstat enabled if a dependent tracepoint is active */
- if (trace_sched_stat_wait_enabled() ||
- trace_sched_stat_sleep_enabled() ||
- trace_sched_stat_iowait_enabled() ||
- trace_sched_stat_blocked_enabled() ||
- trace_sched_stat_runtime_enabled()) {
- printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- "stat_blocked and stat_runtime require the "
- "kernel parameter schedstats=enable or "
- "kernel.sched_schedstats=1\n");
- }
-#endif
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
-static inline bool cfs_bandwidth_used(void);
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
+static inline bool cfs_bandwidth_used(void);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and it's lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
check_schedstat_required();
- update_stats_enqueue(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
+ update_stats_enqueue_fair(cfs_rq, se, flags);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ if (!throttled_hierarchy(cfs_rq)) {
+ list_add_leaf_cfs_rq(cfs_rq);
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -4259,27 +5339,10 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
-}
-
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
if (cfs_rq->next == se)
__clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4287,6 +5350,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ int action = UPDATE_TG;
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -4295,32 +5363,25 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue_fair(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -4334,51 +5395,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- ideal_runtime = sched_slice(cfs_rq, curr);
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
-
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4386,9 +5412,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
@@ -4401,17 +5432,17 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
- schedstat_set(se->statistics.slice_max,
- max((u64)schedstat_val(se->statistics.slice_max),
- se->sum_exec_runtime - se->prev_sum_exec_runtime));
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(se);
+ __schedstat_set(stats->slice_max,
+ max((u64)stats->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -4420,54 +5451,16 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
-
- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
- */
- if (cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
- }
-
/*
- * Prefer last buddy, try to return the CPU to a preempted task.
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
*/
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
-
- clear_buddies(cfs_rq, se);
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
- return se;
+ return pick_eevdf(cfs_rq);
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4484,10 +5477,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -4526,9 +5517,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -4588,8 +5576,20 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- if (cfs_b->quota != RUNTIME_INF)
- cfs_b->runtime = cfs_b->quota;
+ s64 runtime;
+
+ if (unlikely(cfs_b->quota == RUNTIME_INF))
+ return;
+
+ cfs_b->runtime += cfs_b->quota;
+ runtime = cfs_b->runtime_snap - cfs_b->runtime;
+ if (runtime > 0) {
+ cfs_b->burst_time += runtime;
+ cfs_b->nr_burst++;
+ }
+
+ cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
+ cfs_b->runtime_snap = cfs_b->runtime;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4700,12 +5700,23 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
- /* Add cfs_rq with already running entity in the list */
- if (cfs_rq->nr_running >= 1)
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -4718,8 +5729,12 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -4767,31 +5782,51 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
- break;
+ goto done;
- if (dequeue) {
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- } else {
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
- }
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
- if (qcfs_rq->load.weight)
- dequeue = 0;
+ if (qcfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
+ break;
+ }
}
- if (!se)
- sub_nr_running(rq, task_delta);
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ goto done;
+ update_load_avg(qcfs_rq, se, 0);
+ se_update_runnable(se);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
+ qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ }
+
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, task_delta);
+
+done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -4809,93 +5844,184 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
- return;
+ if (!cfs_rq->load.weight) {
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
+ }
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
if (se->on_rq)
break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
unthrottle_throttle:
+ assert_list_leaf_cfs_rq(rq);
+
+ /* Determine whether we need to wake up potentially idle CPU: */
+ if (rq->curr == rq->idle && rq->cfs.nr_running)
+ resched_curr(rq);
+}
+
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
+{
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
/*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
+ * Iterating over the list can trigger several call to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
*/
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
}
- assert_list_leaf_cfs_rq(rq);
+ rcu_read_unlock();
- /* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_curr(rq);
+ rq_clock_stop_loop_update(rq);
+ rq_unlock(rq, &rf);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
- struct cfs_rq *cfs_rq;
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq, *tmp;
+ struct rq_flags rf;
+ struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -4909,16 +6035,44 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu) {
+ unthrottle_cfs_rq_async(cfs_rq);
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
+ }
- if (!remaining)
- break;
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ rq_lock_irqsave(rq, &rf);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
+ rq_unlock_irqrestore(rq, &rf);
}
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
rcu_read_unlock();
+
+ return throttled;
}
/*
@@ -4938,6 +6092,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
+ /* Refill extra burst quota even if cfs_b->idle */
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
@@ -4945,8 +6102,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
if (cfs_b->idle && !throttled)
goto out_deactivate;
- __refill_cfs_bandwidth_runtime(cfs_b);
-
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -4962,10 +6117,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -4999,7 +6152,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -5007,7 +6160,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -5093,15 +6246,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
distribute_cfs_runtime(cfs_b);
-
- raw_spin_lock_irqsave(&cfs_b->lock, flags);
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
@@ -5136,7 +6286,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -5199,6 +6349,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (new < max_cfs_quota_period) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
+ cfs_b->burst *= 2;
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5224,16 +6375,22 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
+
+ /* Add a random offset so that timers interleave */
+ hrtimer_set_expires(&cfs_b->period_timer,
+ get_random_u32_below(cfs_b->period));
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
@@ -5243,6 +6400,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5259,12 +6417,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -5274,12 +6458,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* bits doesn't do much.
*/
-/* cpu online calback */
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5298,7 +6482,14 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5322,8 +6513,50 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
}
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+ * be otherwise able to stop the tick. Just need to check if we are using
+ * bandwidth control.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
@@ -5353,9 +6586,8 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
@@ -5366,9 +6598,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -5377,17 +6618,16 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
return;
}
@@ -5404,11 +6644,10 @@ static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -5422,11 +6661,13 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ /* Return true only if the utilization doesn't fit CPU's capacity */
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5465,6 +6706,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int task_new = !(flags & ENQUEUE_WAKEUP);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5491,6 +6733,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5508,16 +6753,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -5537,25 +6778,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (!task_new)
update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5576,6 +6802,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ util_est_dequeue(&rq->cfs, p);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -5583,6 +6811,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -5612,29 +6843,33 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
}
-dequeue_throttle:
- if (!se)
- sub_nr_running(rq, 1);
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, 1);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
- util_est_dequeue(&rq->cfs, p, task_sleep);
+dequeue_throttle:
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -5642,6 +6877,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -5792,6 +7028,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
+ if (available_idle_cpu(prev_cpu))
+ return prev_cpu;
+
return nr_cpumask_bits;
}
@@ -5849,12 +7088,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
- schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (target == nr_cpumask_bits)
+ schedstat_inc(p->stats.nr_wakeups_affine_attempts);
+ if (target != this_cpu)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ schedstat_inc(p->stats.nr_wakeups_affine);
return target;
}
@@ -5880,11 +7119,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -5970,6 +7213,15 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
+{
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
+ return cpu;
+
+ return -1;
+}
+
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -5983,7 +7235,7 @@ static inline void set_idle_cores(int cpu, int val)
WRITE_ONCE(sds->has_idle_cores, val);
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
@@ -5991,7 +7243,7 @@ static inline bool test_idle_cores(int cpu, bool def)
if (sds)
return READ_ONCE(sds->has_idle_cores);
- return def;
+ return false;
}
/*
@@ -6007,7 +7259,7 @@ void __update_idle_core(struct rq *rq)
int cpu;
rcu_read_lock();
- if (test_idle_cores(core, true))
+ if (test_idle_cores(core))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -6028,39 +7280,31 @@ unlock:
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- if (!test_idle_cores(target, false))
- return -1;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
- for_each_cpu_wrap(core, cpus, target) {
- bool idle = true;
+ bool idle = true;
+ int cpu;
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (!available_idle_cpu(cpu)) {
- idle = false;
- break;
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (!available_idle_cpu(cpu)) {
+ idle = false;
+ if (*idle_cpu == -1) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ *idle_cpu = cpu;
+ break;
+ }
+ continue;
}
+ break;
}
- cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
-
- if (idle)
- return core;
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+ *idle_cpu = cpu;
}
- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (idle)
+ return core;
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
}
@@ -6071,11 +7315,8 @@ static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6086,9 +7327,18 @@ static int select_idle_smt(struct task_struct *p, int target)
#else /* CONFIG_SCHED_SMT */
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static inline void set_idle_cores(int cpu, int val)
{
- return -1;
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ return false;
+}
+
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+{
+ return __select_idle_cpu(core, p);
}
static inline int select_idle_smt(struct task_struct *p, int target)
@@ -6103,52 +7353,68 @@ static inline int select_idle_smt(struct task_struct *p, int target)
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- struct sched_domain *this_sd;
- u64 avg_cost, avg_idle;
- u64 time;
- int this = smp_processor_id();
- int cpu, nr = INT_MAX;
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+ int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
-
- if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
- return -1;
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because !--nr is the condition to stop scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
}
- time = cpu_clock(this);
+ if (static_branch_unlikely(&sched_cluster_active)) {
+ struct sched_group *sg = sd->groups;
+
+ if (sg->flags & SD_CLUSTER) {
+ for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+ if (!cpumask_test_cpu(cpu, cpus))
+ continue;
+
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ return idle_cpu;
+ }
+ }
+ cpumask_andnot(cpus, cpus, sched_group_span(sg));
+ }
+ }
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
- for_each_cpu_wrap(cpu, cpus, target) {
- if (!--nr)
- return -1;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- break;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ break;
+ }
}
- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
+ if (has_idle_core)
+ set_idle_cores(target, false);
- return cpu;
+ return idle_cpu;
}
/*
@@ -6159,71 +7425,109 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
- sync_entity_load_avg(&p->se);
-
- cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
+
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (task_fits_capacity(p, cpu_cap))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
return best_cpu;
}
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
+{
+ if (sched_asym_cpucap_active())
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+
+ return true;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool has_idle_core = false;
struct sched_domain *sd;
- int i, recent_used_cpu;
+ unsigned long task_util, util_min, util_max;
+ int i, recent_used_cpu, prev_aff = -1;
/*
- * For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with cpu's capacity.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
- /*
- * On an asymmetric CPU capacity system where an exclusive
- * cpuset defines a symmetric island (i.e. one unique
- * capacity_orig value through the cpuset), the key will be set
- * but the CPUs within that cpuset will not have a domain with
- * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
- * capacity path.
- */
- if (!sd)
- goto symmetric;
-
- i = select_idle_capacity(p, sd, target);
- return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ if (sched_asym_cpucap_active()) {
+ sync_entity_load_avg(&p->se);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
-symmetric:
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ /*
+ * per-cpu select_rq_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
- return prev;
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(prev, target))
+ return prev;
+
+ prev_aff = prev;
+ }
/*
* Allow a per-cpu kthread to stack with the wakee if the
@@ -6234,95 +7538,197 @@ symmetric:
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(recent_used_cpu, target))
+ return recent_used_cpu;
+
+ } else {
+ recent_used_cpu = -1;
+ }
+
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (sched_asym_cpucap_active()) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
- * Replace recent_used_cpu with prev as it is a potential
- * candidate for the next wake:
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
*/
- p->recent_used_cpu = prev;
- return recent_used_cpu;
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
}
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (sched_smt_active()) {
+ has_idle_core = test_idle_cores(target);
- i = select_idle_cpu(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (!has_idle_core && cpus_share_cache(prev, target)) {
+ i = select_idle_smt(p, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
+ }
- i = select_idle_smt(p, target);
+ i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
+ /*
+ * For cluster machines which have lower sharing cache like L2 or
+ * LLC Tag, we tend to find an idle CPU in the target's cluster
+ * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+ * use them if possible when no idle CPU found in select_idle_cpu().
+ */
+ if ((unsigned int)prev_aff < nr_cpumask_bits)
+ return prev_aff;
+ if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ return recent_used_cpu;
+
return target;
}
/**
- * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
+
+ /*
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
+ */
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
+ lsub_positive(&util, task_util(p));
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
+
+ /*
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
+
+ util = max(util, util_est);
+ }
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ return min(util, arch_scale_cpu_capacity(cpu));
+}
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
}
/*
@@ -6340,154 +7746,104 @@ static inline unsigned long cpu_util(int cpu)
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ p = NULL;
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
+ return cpu_util(cpu, p, -1, 0);
+}
- /*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
- */
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
+/*
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
+ */
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
- /*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
- */
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense with the most recent CPUs where the task has
+ * run.
+ */
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
+{
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
- util = max(util, estimated);
- }
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
- /*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
- */
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+ eenv->task_busy_time = busy_time;
}
/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization for each @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (eventually clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
+ *
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
*/
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
{
- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
- */
- if (task_cpu(p) == cpu && dst_cpu != cpu)
- sub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
- util += task_util(p);
-
- if (sched_feat(UTIL_EST)) {
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ unsigned long busy_time = 0;
+ int cpu;
- /*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
- */
- if (dst_cpu == cpu)
- util_est += _task_util_est(p);
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util(cpu, p, -1, 0);
- util = max(util, util_est);
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
}
- return min(util, capacity_orig_of(cpu));
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
}
/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
*/
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
+ unsigned long max_util = 0;
int cpu;
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util, min, max;
/*
* Performance domain frequency: utilization clamping
@@ -6496,12 +7852,50 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ max_util = max(max_util, eff_util);
}
- return em_pd_energy(pd->em_pd, max_util, sum_util);
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+ return energy;
}
/*
@@ -6545,17 +7939,23 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
+ struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
+ goto unlock;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6565,112 +7965,198 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto fail;
+ goto unlock;
+
+ target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!task_util_est(p) && p_util_min == 0)
goto unlock;
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
- unsigned long base_energy_pd;
+ unsigned long util_min = p_util_min, util_max = p_util_max;
+ unsigned long cpu_cap, cpu_thermal_cap, util;
+ long prev_spare_cap = -1, max_spare_cap = -1;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long cur_delta, base_energy;
int max_spare_cap_cpu = -1;
+ int fits, max_fits = -1;
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
+ continue;
+
+ /* Account thermal pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+ cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+ eenv.cpu_cap = cpu_thermal_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
+ eenv.pd_cap += cpu_thermal_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap - util;
/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
- * aligned with schedutil_cpu_util().
+ * aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;
- /* Always use prev_cpu as a candidate. */
+ lsub_positive(&cpu_cap, util);
+
if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
+ /* Always use prev_cpu as a candidate. */
+ prev_spare_cap = cpu_cap;
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * among the remaining CPUs in the performance
+ * domain.
+ */
+ max_spare_cap = cpu_cap;
+ max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
+ }
+
+ if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
+ continue;
+
+ eenv_pd_busy_time(&eenv, cpus, p);
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (prev_spare_cap > -1) {
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
+ goto unlock;
+ prev_delta -= base_energy;
+ prev_thermal_cap = cpu_thermal_cap;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
/*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
*/
- if (spare_cap > max_spare_cap) {
- max_spare_cap = spare_cap;
- max_spare_cap_cpu = cpu;
- }
- }
+ if ((max_fits < 0) &&
+ (cpu_thermal_cap <= best_thermal_cap))
+ continue;
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- cur_delta -= base_energy_pd;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
+ goto unlock;
+ cur_delta -= base_energy;
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_thermal_cap = cpu_thermal_cap;
}
}
-unlock:
rcu_read_unlock();
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ target = best_energy_cpu;
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
-
- return prev_cpu;
+ return target;
-fail:
+unlock:
rcu_read_unlock();
- return -1;
+ return target;
}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
- * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest CPU in the idlest group, or under
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
- int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+ /* SD_flags and WF_flags share the first nibble */
+ int sd_flag = wake_flags & 0xF;
- if (sd_flag & SD_BALANCE_WAKE) {
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
+ if (wake_flags & WF_TTWU) {
record_wakee(p);
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
@@ -6696,6 +8182,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -6705,21 +8196,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
- } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ } else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
-
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
- if (want_affine)
- current->recent_used_cpu = cpu;
}
rcu_read_unlock();
return new_cpu;
}
-static void detach_entity_cfs_rq(struct sched_entity *se);
-
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6727,57 +8212,26 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (p->state == TASK_WAKING) {
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
+ struct sched_entity *se = &p->se;
- se->vruntime -= min_vruntime;
- }
+ if (!task_on_rq_migrating(p)) {
+ remove_entity_load_avg(se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
/*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_held(&task_rq(p)->lock);
- detach_entity_cfs_rq(&p->se);
-
- } else {
- /*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
-
- /* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->avg.last_update_time = 0;
update_scan_period(p, new_cpu);
}
@@ -6797,111 +8251,41 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
-
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff <= 0)
- return -1;
-
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
-{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->last = se;
- }
-}
-
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
}
}
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
-}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
- int next_buddy_marked = 0;
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
- next_buddy_marked = 1;
}
/*
@@ -6930,37 +8314,67 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
- BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ WARN_ON_ONCE(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle)
+ goto preempt;
+ if (cse_is_idle != pse_is_idle)
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
goto preempt;
- }
return;
preempt:
resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
+}
+
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+ }
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
+ se = pick_next_entity(cfs_rq);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
}
+#endif
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@ -7017,7 +8431,7 @@ again:
}
}
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(cfs_rq);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -7056,7 +8470,7 @@ simple:
put_prev_task(rq, prev);
do {
- se = pick_next_entity(cfs_rq, NULL);
+ se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -7073,10 +8487,11 @@ done: __maybe_unused;
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
return p;
@@ -7127,8 +8542,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
@@ -7144,24 +8557,22 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ se->deadline += calc_delta_fair(se->slice, se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -7316,11 +8727,16 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
+ * Balance SMT group that's fully busy. Can benefit from migration
+ * a task on SMT with busy sibling to another CPU on idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -7349,8 +8765,7 @@ enum migration_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
+#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
@@ -7386,7 +8801,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7394,16 +8809,27 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+ * Don't migrate task if the task's cookie does not match
+ * with the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -7480,7 +8906,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* We do not migrate tasks that are:
@@ -7492,10 +8918,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /* Disregard pcpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
- schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->stats.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
@@ -7504,10 +8934,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
- * already computed one in current iteration.
+ * Avoid computing new_dst_cpu
+ * - for NEWLY_IDLE
+ * - if we have already computed one in current iteration
+ * - if it's an active balance
*/
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE ||
+ env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
@@ -7522,20 +8955,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
- /* Record that we found atleast one task that could run on dst_cpu */
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
- schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+ if (task_on_cpu(env->src_rq, p)) {
+ schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) active balance
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
+ if (env->flags & LBF_ACTIVE_LB)
+ return 1;
+
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
@@ -7544,12 +8981,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->se.statistics.nr_forced_migrations);
+ schedstat_inc(p->stats.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
return 0;
}
@@ -7558,7 +8995,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
@@ -7574,7 +9011,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7595,8 +9032,6 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
@@ -7610,7 +9045,16 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+
+ /*
+ * Source run queue has been emptied by another CPU, clear
+ * LBF_ALL_PINNED flag as we will not test any task.
+ */
+ if (env->src_rq->nr_running <= 1) {
+ env->flags &= ~LBF_ALL_PINNED;
+ return 0;
+ }
if (env->imbalance <= 0)
return 0;
@@ -7623,20 +9067,24 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
env->loop++;
- /* We've more or less seen every task there is, call it quits */
- if (env->loop > env->loop_max)
+ /*
+ * We've more or less seen every task there is, call it quits
+ * unless we haven't found any movable task yet.
+ */
+ if (env->loop > env->loop_max &&
+ !(env->flags & LBF_ALL_PINNED))
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
@@ -7661,8 +9109,7 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
- if (load/2 > env->imbalance &&
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
@@ -7671,7 +9118,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_util:
util = task_util_est(p);
- if (util > env->imbalance)
+ if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= util;
@@ -7683,7 +9130,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -7732,11 +9179,11 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -7807,16 +9254,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
{
- rq->last_blocked_load_update_tick = jiffies;
+ WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
@@ -7848,23 +9299,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
- return false;
-
- return true;
-}
-
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq, *pos;
@@ -7879,7 +9313,10 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
+
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -7888,7 +9325,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -7977,6 +9414,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
+ update_blocked_load_tick(rq);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
@@ -8005,6 +9443,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -8049,7 +9488,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8081,15 +9520,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@@ -8159,7 +9598,7 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
/*
@@ -8170,7 +9609,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
- (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
check_cpu_capacity(rq, sd));
}
@@ -8211,7 +9650,7 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
+ * We consider that a group has spare capacity if the number of task is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
@@ -8262,26 +9701,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
return false;
}
-/*
- * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity than sched_group ref.
- */
-static inline bool
-group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
-}
-
-/*
- * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity_orig than sched_group ref.
- */
-static inline bool
-group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
-}
-
static inline enum
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
@@ -8296,6 +9715,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -8305,36 +9727,154 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
}
-static bool update_nohz_stats(struct rq *rq, bool force)
+/**
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned int cpu = rq->cpu;
+ if (!sched_smt_active())
+ return true;
- if (!rq->has_blocked_load)
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env: The load balancing environment
+ * @sds: Load-balancing data with statistics of the local group
+ * @sgs: Load-balancing statistics of the candidate busiest group
+ * @group: The candidate busiest group
+ *
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
+ *
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
+ * imbalances in the number of CPUS are dealt with in find_busiest_group().
+ *
+ * If we are balancing load within an SMT core, or at PKG domain level, always
+ * proceed.
+ *
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
+ */
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ /* Ensure that the whole local core is idle, if applicable. */
+ if (!sched_use_asym_prio(env->sd, env->dst_cpu))
return false;
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ /*
+ * CPU priorities does not make sense for SMT cores with more than one
+ * busy sibling.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY) {
+ if (sgs->group_weight - sgs->idle_cpus != 1)
+ return false;
+ }
+
+ return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+}
+
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
return false;
- if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
- return true;
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
- update_blocked_averages(cpu);
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (env->idle == CPU_NOT_IDLE)
+ return false;
+
+ /*
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+ * will not be on.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
- return rq->has_blocked_load;
-#else
return false;
-#endif
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that nr_running/ncores ratio are same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize imbalance and do rounding on normalization */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of resource in an empty sched group */
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
int *sg_status)
@@ -8343,16 +9883,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ local_group = group == sds->local;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
- env->flags |= LBF_NOHZ_AGAIN;
-
- sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_load += load;
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8381,25 +9919,34 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;
- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}
+ sgs->group_capacity = group->sgc->capacity;
+
+ sgs->group_weight = group->group_weight;
+
/* Check if dst CPU is idle and preferred to this group */
- if (env->sd->flags & SD_ASYM_PACKING &&
- env->idle != CPU_NOT_IDLE &&
- sgs->sum_h_nr_running &&
- sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+ if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ sched_asym(env, sds, sgs, group)) {
sgs->group_asym_packing = 1;
}
- sgs->group_capacity = group->sgc->capacity;
-
- sgs->group_weight = group->group_weight;
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
@@ -8439,8 +9986,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
- (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
+ (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -8484,6 +10032,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
break;
+ case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose has spare or fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -8493,14 +10051,39 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
/*
+ * Do not pick sg with SMT CPUs over sg with pure CPUs,
+ * as we do not want to pull task off SMT core with one task
+ * and make the core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+has_spare:
+
+ /*
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
@@ -8524,7 +10107,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
- (group_smaller_min_cpu_capacity(sds->local, sg)))
+ (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
return false;
return true;
@@ -8623,6 +10206,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -8642,12 +10229,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
@@ -8692,6 +10279,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -8703,8 +10291,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_has_spare:
/* Select group with most idle CPUs */
- if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
return false;
+
+ /* Select group with lowest group_util */
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+ idlest_sgs->group_util <= sgs->group_util)
+ return false;
+
break;
}
@@ -8729,9 +10323,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
.group_type = group_overloaded,
};
- imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
do {
int local_group;
@@ -8740,6 +10331,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
@@ -8785,6 +10380,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
+
+ /* Calculate allowed imbalance based on load */
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
/*
* When comparing groups across NUMA domains, it's possible for
* the local domain to be very lightly loaded relative to the
@@ -8811,6 +10411,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -8821,7 +10422,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -8834,16 +10437,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
* take care of it.
*/
- if (local_sgs.idle_cpus)
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -8859,6 +10477,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
+{
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
+
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
+
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
+
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -8867,17 +10556,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
+ unsigned long sum_util = 0;
int sg_status = 0;
-#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
- env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8892,7 +10576,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
if (local_group)
goto next_group;
@@ -8908,20 +10592,18 @@ next_group:
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
-
-#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
- WRITE_ONCE(nohz.next_blocked,
- jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
- }
-#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -8941,21 +10623,8 @@ next_group:
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
-}
-
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
-{
- unsigned int imbalance_min;
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the source domain is almost idle.
- */
- imbalance_min = 2;
- if (src_nr_running <= imbalance_min)
- return 0;
-
- return imbalance;
+ update_idle_cpu_scan(env, sum_util);
}
/**
@@ -8972,9 +10641,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}
@@ -8988,6 +10666,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -9005,7 +10690,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* emptying busiest.
*/
if (local->group_type == group_has_spare) {
- if (busiest->group_type > group_fully_busy) {
+ if ((busiest->group_type > group_fully_busy) &&
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
@@ -9034,14 +10720,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
@@ -9049,14 +10733,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
- if (env->sd->flags & SD_NUMA)
+ if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
+ }
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
@@ -9074,8 +10765,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9084,6 +10773,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
+
+ /*
+ * If the local group is more loaded than the average system
+ * load, don't try to pull any tasks.
+ */
+ if (local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
}
/*
@@ -9109,7 +10811,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
+ * misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -9126,12 +10828,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
@@ -9147,24 +10848,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -9177,6 +10877,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -9216,22 +10917,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (env->idle == CPU_NOT_IDLE) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -9242,12 +10953,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -9301,8 +11014,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- capacity = capacity_of(i);
nr_running = rq->cfs.h_nr_running;
+ if (!nr_running)
+ continue;
+
+ capacity = capacity_of(i);
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -9311,7 +11027,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- capacity_of(env->dst_cpu) < capacity &&
+ !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
+ nr_running == 1)
+ continue;
+
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
+ if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
+ sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -9348,7 +11077,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -9399,22 +11128,48 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core if the
+ * whole core is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
-voluntary_active_balance(struct lb_env *env)
+imbalanced_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ /*
+ * The imbalanced case includes the case of pinned tasks preventing a fair
+ * distribution of the load on the system but also the even distribution of the
+ * threads on a system with spare capacity
+ */
+ if ((env->migration_type == migrate_task) &&
+ (sd->nr_balance_failed > sd->cache_nice_tries+2))
+ return 1;
+
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (asym_active_balance(env))
return 1;
+ if (imbalanced_active_balance(env))
+ return 1;
+
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
* It's worth migrating the task if the src_cpu's capacity is reduced
@@ -9434,22 +11189,13 @@ voluntary_active_balance(struct lb_env *env)
return 0;
}
-static int need_active_balance(struct lb_env *env)
-{
- struct sched_domain *sd = env->sd;
-
- if (voluntary_active_balance(env))
- return 1;
-
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -9461,19 +11207,52 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
- /* Are we the first idle CPU? */
+ /*
+ * Don't balance to idle SMT in busy core right away when
+ * balancing cores, but remember the first idle SMT CPU for
+ * later consideration. Find CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then its not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
+ continue;
+ }
+
+ /*
+ * Are we the first idle core in a non-SMT domain or higher,
+ * or the first idle CPU in a SMT domain?
+ */
return cpu == env->dst_cpu;
}
+ /* Are we the first idle CPU with busy siblings? */
+ if (idle_smt != -1)
+ return idle_smt == env->dst_cpu;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
@@ -9492,14 +11271,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
@@ -9527,7 +11305,7 @@ redo:
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
@@ -9535,6 +11313,8 @@ redo:
env.src_rq = busiest;
ld_moved = 0;
+ /* Clear this flag as soon as we find a pullable task */
+ env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -9542,7 +11322,6 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -9574,7 +11353,9 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
+ /* Stop if we tried all running tasks */
+ if (env.loop < busiest->nr_running)
+ goto more_balance;
}
/*
@@ -9591,7 +11372,7 @@ more_balance:
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
+ * given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
@@ -9605,7 +11386,7 @@ more_balance:
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -9637,7 +11418,7 @@ more_balance:
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -9658,7 +11439,7 @@ more_balance:
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9666,12 +11447,13 @@ more_balance:
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
- env.flags |= LBF_ALL_PINNED;
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
+ /* Record that we found at least one task that could run on this_cpu */
+ env.flags &= ~LBF_ALL_PINNED;
+
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
@@ -9682,32 +11464,23 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ preempt_disable();
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
-
- /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ preempt_enable();
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ }
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
+ if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
}
goto out;
@@ -9741,7 +11514,7 @@ out_one_pinned:
/*
* newidle_balance() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
- * skyrocketting in a short amount of time. Skip the balance_interval
+ * skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
@@ -9766,6 +11539,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -9823,7 +11605,7 @@ static int active_load_balance_cpu_stop(void *data)
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
@@ -9840,13 +11622,7 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
- /*
- * can_migrate_task() doesn't need to compute new_dst_cpu
- * for active balancing. Since we have CPU_IDLE, but no
- * @dst_grpmask we need to make that test go away with lying
- * about DST_PINNED.
- */
- .flags = LBF_DST_PINNED,
+ .flags = LBF_ACTIVE_LB,
};
schedstat_inc(sd->alb_count);
@@ -9885,6 +11661,30 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
+static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+{
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = jiffies;
+ } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+ sd->last_decay_max_lb_cost = jiffies;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -9908,14 +11708,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
+ * visit to all the domains.
*/
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
+ need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -9973,22 +11768,9 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance)) {
+ if (likely(update_next_balance))
rq->next_balance = next_balance;
-#ifdef CONFIG_NO_HZ_COMMON
- /*
- * If this CPU has been elected to perform the nohz idle
- * balance. Other idle CPUs have already rebalanced with
- * nohz_idle_balance() and nohz.next_balance has been
- * updated accordingly. This CPU is now running the idle load
- * balance for itself and we need to update the
- * nohz.next_balance accordingly.
- */
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
- nohz.next_balance = rq->next_balance;
-#endif
- }
}
static inline int on_null_domain(struct rq *rq)
@@ -9998,40 +11780,53 @@ static inline int on_null_domain(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
* anywhere yet.
*/
-
static inline int find_new_ilb(void)
{
- int ilb;
+ const struct cpumask *hk_mask;
+ int ilb_cpu;
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+
+ for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
+
+ if (ilb_cpu == smp_processor_id())
+ continue;
- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- housekeeping_cpumask(HK_FLAG_MISC)) {
- if (idle_cpu(ilb))
- return ilb;
+ if (idle_cpu(ilb_cpu))
+ return ilb_cpu;
}
- return nr_cpu_ids;
+ return -1;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
+ if (ilb_cpu < 0)
return;
/*
@@ -10044,7 +11839,7 @@ static void kick_ilb(unsigned int flags)
/*
* This way we generate an IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
+ * is idle, and the softirq performing NOHZ idle load balancing
* will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -10073,7 +11868,7 @@ static void nohz_balancer_kick(struct rq *rq)
/*
* None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
+ * balancing:
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
@@ -10086,7 +11881,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;
if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}
@@ -10095,12 +11890,11 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
/*
- * If there's a CFS task and the current CPU has reduced
- * capacity; kick the ILB to see if there's a better CPU to run
- * on.
+ * If there's a runnable CFS task and the current CPU has reduced
+ * capacity, kick the ILB to see if there's a better CPU to run on:
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10111,10 +11905,14 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing betwen cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10127,7 +11925,7 @@ static void nohz_balancer_kick(struct rq *rq)
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -10145,22 +11943,25 @@ static void nohz_balancer_kick(struct rq *rq)
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
+ * increase the overall cache utilization), we need a less-loaded LLC
+ * domain to pull some load from. Likewise, we may need to spread
* load within the current LLC domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
+ * the others are - so just get a NOHZ balance going if it looks
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10226,7 +12027,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
@@ -10257,12 +12058,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10271,15 +12073,30 @@ out:
WRITE_ONCE(nohz.has_blocked, 1);
}
+static bool update_nohz_stats(struct rq *rq)
+{
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+}
+
/*
* Internal function that runs load balance for all idle cpus. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
*/
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -10288,7 +12105,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
- int ret = false;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10296,12 +12112,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10309,8 +12130,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
*/
smp_mb();
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ /*
+ * Start with the next CPU after this_cpu so we will end with this_cpu and let a
+ * chance for other idle cpu to pull load.
+ */
+ for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ if (!idle_cpu(balance_cpu))
continue;
/*
@@ -10319,13 +12144,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq, true);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
@@ -10348,26 +12177,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
- /* Newly idle CPU doesn't need an update */
- if (idle != CPU_NEWLY_IDLE) {
- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
- }
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(this_rq, CPU_IDLE);
-
- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
-
- /* The full idle balance loop has been done */
- ret = true;
-
-abort:
- /* There is still blocked load, enable periodic update */
- if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
-
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
@@ -10376,7 +12185,14 @@ abort:
if (likely(update_next_balance))
nohz.next_balance = next_balance;
- return ret;
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
}
/*
@@ -10395,11 +12211,40 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (idle != CPU_IDLE)
return false;
- _nohz_idle_balance(this_rq, flags, idle);
+ _nohz_idle_balance(this_rq, flags);
return true;
}
+/*
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * cpu about to enter idle, because it can take a long time.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+ unsigned int flags;
+
+ flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+ /*
+ * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+ * (ie NOHZ_STATS_KICK set) and will do the same.
+ */
+ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
+}
+
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
@@ -10408,7 +12253,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
@@ -10420,16 +12265,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
/*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
*/
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10444,7 +12284,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * idle_balance is called by schedule() if this_cpu is about to become
+ * newidle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -10456,11 +12296,19 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
- u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
+
+ /*
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10481,64 +12329,64 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !READ_ONCE(this_rq->rd->overload)) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+
+ if (!READ_ONCE(this_rq->rd->overload) ||
+ (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
- nohz_newidle_balance(this_rq);
-
goto out;
}
+ rcu_read_unlock();
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
+ t0 = sched_clock_cpu(this_cpu);
update_blocked_averages(this_cpu);
+
rcu_read_lock();
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
- u64 t0, domain_cost;
+ u64 domain_cost;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
break;
- }
if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+ update_newidle_cost(sd, domain_cost);
curr_cost += domain_cost;
+ t0 = t1;
}
- update_next_balance(sd, &next_balance);
-
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
-out:
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
@@ -10547,16 +12395,19 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
@@ -10594,8 +12445,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /*
+ * Don't need to rebalance while attached to NULL domain or
+ * runqueue CPU is not active
+ */
+ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
@@ -10617,10 +12471,140 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU.
+ */
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * min_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+ cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+ return throttled_hierarchy(cfs_rq);
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -10644,6 +12628,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -10653,8 +12639,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -10663,22 +12649,9 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}
@@ -10700,39 +12673,11 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
- check_preempt_curr(rq, p, 0);
-}
-
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (p->state == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
+ wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10742,7 +12687,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -10750,10 +12701,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
@@ -10764,10 +12718,21 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+#ifdef CONFIG_SMP
+ /*
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+#endif
+
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10775,34 +12740,16 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
detach_entity_cfs_rq(se);
}
@@ -10810,12 +12757,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -10833,10 +12776,10 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
}
@@ -10871,55 +12814,36 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_set_group_fair(struct task_struct *p)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
-
- set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-}
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
-static void task_move_group_fair(struct task_struct *p)
-{
detach_task_cfs_rq(p);
- set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
#endif
+ set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
-static void task_change_group_fair(struct task_struct *p, int type)
-{
- switch (type) {
- case TASK_SET_GROUP:
- task_set_group_fair(p);
- break;
-
- case TASK_MOVE_GROUP:
- task_move_group_fair(p);
- break;
- }
-}
-
void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -10946,7 +12870,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -10954,7 +12878,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity),
+ se = kzalloc_node(sizeof(struct sched_entity_stats),
GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err_free_rq;
@@ -10996,6 +12920,8 @@ void unregister_fair_sched_group(struct task_group *tg)
struct rq *rq;
int cpu;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]);
@@ -11009,9 +12935,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -11048,10 +12974,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ lockdep_assert_held(&shares_mutex);
+
/*
* We can't change the weight of the root cgroup.
*/
@@ -11060,9 +12988,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
@@ -11080,22 +13007,95 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf);
}
-done:
- mutex_unlock(&shares_mutex);
return 0;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-void free_fair_sched_group(struct task_group *tg) { }
-
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
- return 1;
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
}
-void online_fair_sched_group(struct task_group *tg) { }
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
-void unregister_fair_sched_group(struct task_group *tg) { }
+ if (se->on_rq) {
+ parent_cfs_rq = cfs_rq_of(se);
+ if (cfs_rq_is_idle(grp_cfs_rq))
+ parent_cfs_rq->idle_nr_running++;
+ else
+ parent_cfs_rq->idle_nr_running--;
+ }
+
+ idle_task_delta = grp_cfs_rq->h_nr_running -
+ grp_cfs_rq->idle_h_nr_running;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11110,7 +13110,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
@@ -11118,14 +13118,14 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+DEFINE_SCHED_CLASS(fair) = {
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
@@ -11133,6 +13133,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.balance = balance_fair,
+ .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -11158,6 +13159,10 @@ const struct sched_class fair_sched_class = {
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_fair,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
@@ -11202,6 +13207,20 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
+ int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
+ }
+
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
@@ -11212,83 +13231,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
- }
-
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f391..143f55df890b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
- */
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -20,14 +16,7 @@ SCHED_FEAT(START_DEBIT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
- */
-SCHED_FEAT(LAST_BUDDY, true)
-
-/*
- * Consider buddies to be cache hot, decreases the likelyness of a
+ * Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
@@ -38,6 +27,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(HRTICK_DL, false)
SCHED_FEAT(DOUBLE_TICK, false)
/*
@@ -45,17 +35,21 @@ SCHED_FEAT(DOUBLE_TICK, false)
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_AVG_CPU, false)
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_UTIL, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
@@ -77,7 +71,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
@@ -89,4 +83,7 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
-SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(LATENCY_WARN, false)
+
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..31231925f1ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,9 +6,6 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
-#include "sched.h"
-
-#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -54,17 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
+ instrumentation_begin();
+ trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
+
+ ct_cpuidle_exit();
start_critical_timings();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -73,11 +75,10 @@ static noinline int __cpuidle cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- local_irq_enable();
}
/**
@@ -87,13 +88,20 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
+ trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
+
+ ct_cpuidle_enter();
arch_cpu_idle();
+ ct_cpuidle_exit();
+
start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
@@ -131,7 +139,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*
* NOTE: no locks or semaphores should be used here
*
- * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * On architectures that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
@@ -158,7 +166,6 @@ static void cpuidle_idle_call(void)
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
default_idle_call();
goto exit_idle;
@@ -168,7 +175,7 @@ static void cpuidle_idle_call(void)
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in interrupts (if any). In that case bypass
- * the cpuidle governor and go stratight for the deepest idle state
+ * the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
@@ -178,21 +185,17 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
- rcu_idle_enter();
entered_state = call_cpuidle_s2idle(drv, dev);
if (entered_state > 0)
goto exit_idle;
- rcu_idle_exit();
-
max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
@@ -209,8 +212,6 @@ static void cpuidle_idle_call(void)
else
tick_nohz_idle_retain_tick();
- rcu_idle_enter();
-
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -226,8 +227,6 @@ exit_idle:
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
-
- rcu_idle_exit();
}
/*
@@ -238,6 +237,12 @@ exit_idle:
static void do_idle(void)
{
int cpu = smp_processor_id();
+
+ /*
+ * Check if we need to update blocked load
+ */
+ nohz_run_idle_balance(cpu);
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -253,6 +258,36 @@ static void do_idle(void)
while (!need_resched()) {
rmb();
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell about pending needed timer reprogram.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupt disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can leak a pending needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
local_irq_disable();
if (cpu_is_offline(cpu)) {
@@ -262,6 +297,7 @@ static void do_idle(void)
}
arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
/*
* In poll mode we reenable interrupts and spin. Also if we
@@ -300,7 +336,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
@@ -341,6 +377,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
WARN_ON_ONCE(!duration_ns);
+ WARN_ON_ONCE(current->mm);
rcu_sleep_check();
preempt_disable();
@@ -348,10 +385,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
it.timer.function = idle_inject_timer_fn;
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
@@ -366,6 +403,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
+ current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
@@ -378,7 +416,7 @@ void cpu_startup_entry(enum cpuhp_state state)
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -393,7 +431,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
@@ -408,6 +446,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
schedstat_inc(rq->sched_goidle);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+ return rq->idle;
+}
+#endif
+
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
@@ -424,10 +469,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
}
/*
@@ -453,11 +498,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -465,14 +505,14 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
+DEFINE_SCHED_CLASS(idle) = {
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
- .check_preempt_curr = check_preempt_curr_idle,
+ .wakeup_preempt = wakeup_preempt_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
@@ -480,14 +520,13 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.balance = balance_idle,
+ .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..373d42c707bc 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,140 +7,182 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
-#include "sched.h"
+
+enum hk_flags {
+ HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
+ HK_FLAG_RCU = BIT(HK_TYPE_RCU),
+ HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
+ HK_FLAG_TICK = BIT(HK_TYPE_TICK),
+ HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
+ HK_FLAG_WQ = BIT(HK_TYPE_WQ),
+ HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
+ HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
-static cpumask_var_t housekeeping_mask;
-static unsigned int housekeeping_flags;
-bool housekeeping_enabled(enum hk_flags flags)
+struct housekeeping {
+ cpumask_var_t cpumasks[HK_TYPE_MAX];
+ unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
+
+bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping_flags & flags);
+ return !!(housekeeping.flags & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
-int housekeeping_any_cpu(enum hk_flags flags)
+int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
- if (housekeeping_flags & flags) {
- cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (housekeeping.flags & BIT(type)) {
+ cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
}
}
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return housekeeping_mask;
+ if (housekeeping.flags & BIT(type))
+ return housekeeping.cpumasks[type];
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- set_cpus_allowed_ptr(t, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
-bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return cpumask_test_cpu(cpu, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
- if (!housekeeping_flags)
+ enum hk_type type;
+
+ if (!housekeeping.flags)
return;
static_branch_enable(&housekeeping_overridden);
- if (housekeeping_flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_TICK)
sched_tick_offload_init();
- /* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ }
}
-static int __init housekeeping_setup(char *str, enum hk_flags flags)
+static void __init housekeeping_setup_type(enum hk_type type,
+ cpumask_var_t housekeeping_staging)
{
- cpumask_var_t non_housekeeping_mask;
- cpumask_var_t tmp;
- int err;
+
+ alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+ cpumask_copy(housekeeping.cpumasks[type],
+ housekeeping_staging);
+}
+
+static int __init housekeeping_setup(char *str, unsigned long flags)
+{
+ cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ int err = 0;
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ return 0;
+ }
+ }
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
- err = cpulist_parse(str, non_housekeeping_mask);
- if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ goto free_non_housekeeping_mask;
}
- alloc_bootmem_cpumask_var(&tmp);
- if (!housekeeping_flags) {
- alloc_bootmem_cpumask_var(&housekeeping_mask);
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, non_housekeeping_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_staging);
+ cpumask_andnot(housekeeping_staging,
+ cpu_possible_mask, non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp)) {
+ if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ if (!housekeeping.flags) {
pr_warn("Housekeeping: must include one present CPU, "
"using boot CPU:%d\n", smp_processor_id());
- __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- }
- } else {
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp))
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
- if (!cpumask_equal(tmp, housekeeping_mask)) {
- pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
- free_bootmem_cpumask_var(tmp);
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
}
}
- free_bootmem_cpumask_var(tmp);
- if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
- if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- tick_nohz_full_setup(non_housekeeping_mask);
- } else {
- pr_warn("Housekeeping: nohz unsupported."
- " Build with CONFIG_NO_HZ_FULL\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ if (!housekeeping.flags) {
+ /* First setup call ("nohz_full=" or "isolcpus=") */
+ enum hk_type type;
+
+ for_each_set_bit(type, &flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ } else {
+ /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+ enum hk_type type;
+ unsigned long iter_flags = flags & housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+ if (!cpumask_equal(housekeeping_staging,
+ housekeeping.cpumasks[type])) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ goto free_housekeeping_staging;
+ }
}
+
+ iter_flags = flags & ~housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
}
- housekeeping_flags |= flags;
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ tick_nohz_full_setup(non_housekeeping_mask);
+
+ housekeeping.flags |= flags;
+ err = 1;
+free_housekeeping_staging:
+ free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
free_bootmem_cpumask_var(non_housekeeping_mask);
- return 1;
+ return err;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned int flags;
+ unsigned long flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+ HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}
@@ -148,7 +190,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
- unsigned int flags = 0;
+ unsigned long flags = 0;
bool illegal = false;
char *par;
int len;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da666ac7..52c8f8226b0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,7 +6,6 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
-#include "sched.h"
/*
* Global load-average calculations
@@ -81,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (long)this_rq->nr_uninterruptible;
+ nr_active += (int)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -189,7 +188,7 @@ calc_load_n(unsigned long load, unsigned long exp,
* w:0 1 1 0 0 1 1 0 0
*
* This ensures we'll fold the old NO_HZ contribution in this window while
- * accumlating the new one.
+ * accumulating the new one.
*
* - When we wake up from NO_HZ during the window, we push up our
* contribution, since we effectively move our sample point to a known
@@ -347,7 +346,7 @@ static inline void calc_global_nohz(void) { }
*
* Called from the global timer code.
*/
-void calc_global_load(unsigned long ticks)
+void calc_global_load(void)
{
unsigned long sample_window;
long active, delta;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..4e715b9b278e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,7 +4,134 @@
*
* membarrier system call
*/
-#include "sched.h"
+
+/*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ * barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r2 = y
+ * y = 1
+ * barrier()
+ * r1 = x
+ *
+ * BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed by provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ * barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * barrier()
+ * y = 1
+ * r2 = y
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r1 = x
+ * BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0". This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * d: switch to kthread (includes mb)
+ * b: read rq->curr->mm == NULL
+ * e: switch to user (includes mb)
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * exit_mm():
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * kthread_unuse_mm()
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * kthread_use_mm()
+ * f: current->mm = mm
+ * g: smp_mb()
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -18,18 +145,61 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
- | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
+
+static DEFINE_MUTEX(membarrier_ipi_mutex);
+#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory on remote CPUs that occur before the IPI
+ * become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
+static void ipi_rseq(void *info)
+{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
+ rseq_preempt(current);
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -63,6 +233,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
this_cpu_write(runqueues.membarrier_state, 0);
}
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+ struct rq *rq = this_rq();
+ int membarrier_state = 0;
+
+ if (next_mm)
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
static int membarrier_global_expedited(void)
{
int cpu;
@@ -80,6 +262,7 @@ static int membarrier_global_expedited(void)
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -101,12 +284,11 @@ static int membarrier_global_expedited(void)
continue;
/*
- * Skip the CPU if it runs a kernel thread. The scheduler
- * leaves the prior task mm in place as an optimization when
- * scheduling a kthread.
+ * Skip the CPU if it runs a kernel thread which is not using
+ * a task mm.
*/
p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p->flags & PF_KTHREAD)
+ if (!p->mm)
continue;
__cpumask_set_cpu(cpu, tmpmask);
@@ -129,25 +311,35 @@ static int membarrier_global_expedited(void)
return 0;
}
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
{
- int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ ipi_func = ipi_sync_core;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
} else {
+ WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
}
- if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+ (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
/*
@@ -156,35 +348,74 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
- rcu_read_lock();
- for_each_online_cpu(cpu) {
+
+ if (cpu_id >= 0) {
struct task_struct *p;
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+ }
+
+ if (cpu_id >= 0) {
/*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
*/
- if (cpu == raw_smp_processor_id())
- continue;
- p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p && p->mm == mm)
- __cpumask_set_cpu(cpu, tmpmask);
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ } else {
+ /*
+ * For regular membarrier, we can save a few cycles by
+ * skipping the current cpu -- we're about to do smp_mb()
+ * below, and if we migrate to a different cpu, this cpu
+ * and the new cpu will execute a full barrier in the
+ * scheduler.
+ *
+ * For SYNC_CORE, we do need a barrier on the current cpu --
+ * otherwise, if we are migrated and replaced by a different
+ * task in the same mm just before, during, or after
+ * membarrier, we will end up with some thread in the mm
+ * running without a core sync.
+ *
+ * For RSEQ, don't rseq_preempt() the caller. User code
+ * is not supposed to issue syscalls at all from inside an
+ * rseq critical section.
+ */
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_func, NULL, true);
+ preempt_enable();
+ } else {
+ on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+ }
}
- rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
-
- free_cpumask_var(tmpmask);
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -229,11 +460,12 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
/*
* For each cpu runqueue, if the task's mm match @mm, ensure that all
- * @mm's membarrier state set bits are also set in in the runqueue's
+ * @mm's membarrier state set bits are also set in the runqueue's
* membarrier state. This ensures that a runqueue scheduling
* between threads which are users of @mm has its membarrier state
* updated.
*/
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -246,9 +478,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
@@ -283,11 +513,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
}
/*
@@ -299,6 +536,8 @@ static int membarrier_register_private_expedited(int flags)
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@@ -308,10 +547,51 @@ static int membarrier_register_private_expedited(int flags)
return 0;
}
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
/**
* sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@@ -337,10 +617,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O
* sys_membarrier() O O O
*/
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
- if (unlikely(flags))
- return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@@ -362,13 +653,19 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited(0);
+ return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
- return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
default:
return -EINVAL;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..63b6cf898220 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Per Entity Load Tracking
+ * Per Entity Load Tracking (PELT)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
@@ -24,12 +24,6 @@
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
-#include <linux/sched.h>
-#include "sched.h"
-#include "pelt.h"
-
-#include <trace/events/sched.h>
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -83,8 +77,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -137,7 +129,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* runnable = running = 0;
*
* clause from ___update_load_sum(); this results in
- * the below usage of @contrib to dissapear entirely,
+ * the below usage of @contrib to disappear entirely,
* so no point in calculating it.
*/
contrib = __accumulate_pelt_segments(periods,
@@ -264,7 +256,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..9e1083465fbc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,14 +37,12 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return PELT_MIN_DIVIDER + avg->period_contrib;
+}
static inline void cfs_se_util_change(struct sched_avg *avg)
{
@@ -53,14 +51,33 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
if (!sched_feat(UTIL_EST))
return;
- /* Avoid store if the flag has been already set */
- enqueued = avg->util_est.enqueued;
+ /* Avoid store if the flag has been already reset */
+ enqueued = avg->util_est;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
- WRITE_ONCE(avg->util_est.enqueued, enqueued);
+ WRITE_ONCE(avg->util_est, enqueued);
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+/* The rq is idle, we can sync to clock_task */
+static inline void _update_idle_rq_clock_pelt(struct rq *rq)
+{
+ rq->clock_pelt = rq_clock_task(rq);
+
+ u64_u32_store(rq->clock_idle, rq_clock(rq));
+ /* Paired with smp_rmb in migrate_se_pelt_lag() */
+ smp_wmb();
+ u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}
/*
@@ -78,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
- /* The rq is idle, we can sync to clock_task */
- rq->clock_pelt = rq_clock_task(rq);
+ _update_idle_rq_clock_pelt(rq);
return;
}
@@ -125,33 +141,40 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
- * considered as an always runnig rq without idle time to
+ * considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ _update_idle_rq_clock_pelt(rq);
}
-static inline u64 rq_clock_pelt(struct rq *rq)
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- lockdep_assert_held(&rq->lock);
- assert_clock_updated(rq);
+ u64 throttled;
- return rq->clock_pelt - rq->lost_idle_time;
+ if (unlikely(cfs_rq->throttle_count))
+ throttled = U64_MAX;
+ else
+ throttled = cfs_rq->throttled_clock_pelt_time;
+
+ u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
-#ifdef CONFIG_CFS_BANDWIDTH
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
- return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
@@ -206,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..7b4aa5809c0f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,10 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@@ -59,7 +69,7 @@
* states, we would have to conclude a CPU SOME pressure number of
* 100%, since *somebody* is waiting on a runqueue at all
* times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
* threads will be contended at any given time, or about 0.4%.
*
* Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,18 +83,18 @@
* we have to base our calculation on the number of non-idle tasks in
* conjunction with the number of available CPUs, which is the number
* of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
* threads that are unproductive due to delays:
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -109,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -127,24 +137,10 @@
* sampling of the aggregate task states would be.
*/
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -164,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
-#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@@ -179,53 +174,66 @@ struct psi_group psi_system = {
static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);
+
static void group_init(struct psi_group *group)
{
int cpu;
+ group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
- INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
- /* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
- mutex_init(&group->trigger_lock);
- INIT_LIST_HEAD(&group->triggers);
- memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
- group->poll_states = 0;
- group->poll_min_period = U32_MAX;
- memset(group->polling_total, 0, sizeof(group->polling_total));
- group->polling_next_update = ULLONG_MAX;
- group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+
+ /* Init avg trigger-related members */
+ INIT_LIST_HEAD(&group->avg_triggers);
+ memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+ /* Init rtpoll trigger-related members */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ mutex_init(&group->rtpoll_trigger_lock);
+ INIT_LIST_HEAD(&group->rtpoll_triggers);
+ group->rtpoll_min_period = U32_MAX;
+ group->rtpoll_next_update = ULLONG_MAX;
+ init_waitqueue_head(&group->rtpoll_wait);
+ timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->rtpoll_task, NULL);
}
void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
return;
}
+ if (!cgroup_psi_enabled())
+ static_branch_disable(&psi_cgroups_enabled);
+
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
- return tasks[NR_IOWAIT];
+ return unlikely(tasks[NR_IOWAIT]);
case PSI_IO_FULL:
- return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+ return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
case PSI_MEM_SOME:
- return tasks[NR_MEMSTALL];
+ return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+ return unlikely(tasks[NR_MEMSTALL] &&
+ tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+ return unlikely(tasks[NR_RUNNING] > oncpu);
+ case PSI_CPU_FULL:
+ return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -239,6 +247,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -253,6 +263,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -277,6 +289,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /*
+ * When collect_percpu_times() from the avgs_work, we don't want to
+ * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+ * this avgs_work is never IDLE, cause avgs_work can't be shut off.
+ * So for the current CPU, we need to re-arm avgs_work only when
+ * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+ * we can just check PSI_NONIDLE delta.
+ */
+ if (current_work() == &group->avgs_work.work) {
+ bool reschedule;
+
+ if (cpu == current_cpu)
+ reschedule = tasks[NR_RUNNING] +
+ tasks[NR_IOWAIT] +
+ tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= PSI_STATE_RESCHEDULE;
+ }
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -354,6 +388,114 @@ static void collect_percpu_times(struct psi_group *group,
*pchanged_states = changed_states;
}
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div64_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
+
+static void update_triggers(struct psi_group *group, u64 now,
+ enum psi_aggregators aggregator)
+{
+ struct psi_trigger *t;
+ u64 *total = group->total[aggregator];
+ struct list_head *triggers;
+ u64 *aggregator_total;
+
+ if (aggregator == PSI_AVGS) {
+ triggers = &group->avg_triggers;
+ aggregator_total = group->avg_total;
+ } else {
+ triggers = &group->rtpoll_triggers;
+ aggregator_total = group->rtpoll_total;
+ }
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, triggers, node) {
+ u64 growth;
+ bool new_stall;
+
+ new_stall = aggregator_total[t->state] != total[t->state];
+
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
+ /*
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
+ */
+ if (new_stall) {
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (!t->pending_event) {
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
+ }
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0) {
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+ }
+ t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
+ }
+}
+
static u64 update_averages(struct psi_group *group, u64 now)
{
unsigned long missed_periods = 0;
@@ -412,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool nonidle;
u64 now;
dwork = to_delayed_work(work);
@@ -423,7 +564,6 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
collect_percpu_times(group, PSI_AVGS, &changed_states);
- nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -431,10 +571,12 @@ static void psi_avgs_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
- if (now >= group->avg_next_update)
+ if (now >= group->avg_next_update) {
+ update_triggers(group, now, PSI_AVGS);
group->avg_next_update = update_averages(group, now);
+ }
- if (nonidle) {
+ if (changed_states & PSI_STATE_RESCHEDULE) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
group->avg_next_update - now) + 1);
}
@@ -442,194 +584,161 @@ static void psi_avgs_work(struct work_struct *work)
mutex_unlock(&group->avgs_lock);
}
-/* Trigger tracking window manupulations */
-static void window_reset(struct psi_window *win, u64 now, u64 value,
- u64 prev_growth)
-{
- win->start_time = now;
- win->start_value = value;
- win->prev_growth = prev_growth;
-}
-
-/*
- * PSI growth tracking window update and growth calculation routine.
- *
- * This approximates a sliding tracking window by interpolating
- * partially elapsed windows using historical growth data from the
- * previous intervals. This minimizes memory requirements (by not storing
- * all the intermediate values in the previous window) and simplifies
- * the calculations. It works well because PSI signal changes only in
- * positive direction and over relatively small window sizes the growth
- * is close to linear.
- */
-static u64 window_update(struct psi_window *win, u64 now, u64 value)
-{
- u64 elapsed;
- u64 growth;
-
- elapsed = now - win->start_time;
- growth = value - win->start_value;
- /*
- * After each tracking window passes win->start_value and
- * win->start_time get reset and win->prev_growth stores
- * the average per-window growth of the previous window.
- * win->prev_growth is then used to interpolate additional
- * growth from the previous window assuming it was linear.
- */
- if (elapsed > win->size)
- window_reset(win, now, value, growth);
- else {
- u32 remaining;
-
- remaining = win->size - elapsed;
- growth += div64_u64(win->prev_growth * remaining, win->size);
- }
-
- return growth;
-}
-
-static void init_triggers(struct psi_group *group, u64 now)
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
- list_for_each_entry(t, &group->triggers, node)
+ list_for_each_entry(t, &group->rtpoll_triggers, node)
window_reset(&t->win, now,
group->total[PSI_POLL][t->state], 0);
- memcpy(group->polling_total, group->total[PSI_POLL],
- sizeof(group->polling_total));
- group->polling_next_update = now + group->poll_min_period;
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
}
-static u64 update_triggers(struct psi_group *group, u64 now)
+/* Schedule rtpolling if it's not already scheduled or forced. */
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
+ bool force)
{
- struct psi_trigger *t;
- bool new_stall = false;
- u64 *total = group->total[PSI_POLL];
+ struct task_struct *task;
/*
- * On subsequent updates, calculate growth deltas and let
- * watchers know when their specified thresholds are exceeded.
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_rtpoll_work).
*/
- list_for_each_entry(t, &group->triggers, node) {
- u64 growth;
-
- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
- continue;
-
- /*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
- */
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
-
- /* Limit event signaling to once per window */
- if (now < t->last_event_time + t->win.size)
- continue;
-
- /* Generate an event */
- if (cmpxchg(&t->event, 0, 1) == 0)
- wake_up_interruptible(&t->event_wait);
- t->last_event_time = now;
- }
-
- if (new_stall)
- memcpy(group->polling_total, total,
- sizeof(group->polling_total));
-
- return now + group->poll_min_period;
-}
-
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
-{
- struct kthread_worker *kworker;
-
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->rtpoll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+ if (likely(task))
+ mod_timer(&group->rtpoll_timer, jiffies + delay);
else
- atomic_set(&group->poll_scheduled, 0);
+ atomic_set(&group->rtpoll_scheduled, 0);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_rtpoll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
+ bool force_reschedule = false;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
+ mutex_lock(&group->rtpoll_trigger_lock);
- atomic_set(&group->poll_scheduled, 0);
+ now = sched_clock();
- mutex_lock(&group->trigger_lock);
+ if (now > group->rtpoll_until) {
+ /*
+ * We are either about to start or might stop rtpolling if no
+ * state change was recorded. Resetting rtpoll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immediate rtpoll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the rtpolling window
+ * should be negligible and rtpoll_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ /*
+ * A task change can race with the rtpoll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * rtpoll_scheduled and the task state accesses, such that if the
+ * rtpoll worker misses the state update, the task change is
+ * guaranteed to reschedule the rtpoll worker:
+ *
+ * rtpoll worker:
+ * atomic_set(rtpoll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(rtpoll_scheduled, 1) == 0:
+ * schedule rtpoll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* The rtpolling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
- now = sched_clock();
collect_percpu_times(group, PSI_POLL, &changed_states);
- if (changed_states & group->poll_states) {
- /* Initialize trigger windows when entering polling mode */
- if (now > group->polling_until)
- init_triggers(group, now);
+ if (changed_states & group->rtpoll_states) {
+ /* Initialize trigger windows when entering rtpolling mode */
+ if (now > group->rtpoll_until)
+ init_rtpoll_triggers(group, now);
/*
* Keep the monitor active for at least the duration of the
* minimum tracking window as long as monitor states are
* changing.
*/
- group->polling_until = now +
- group->poll_min_period * UPDATES_PER_WINDOW;
+ group->rtpoll_until = now +
+ group->rtpoll_min_period * UPDATES_PER_WINDOW;
}
- if (now > group->polling_until) {
- group->polling_next_update = ULLONG_MAX;
+ if (now > group->rtpoll_until) {
+ group->rtpoll_next_update = ULLONG_MAX;
goto out;
}
- if (now >= group->polling_next_update)
- group->polling_next_update = update_triggers(group, now);
+ if (now >= group->rtpoll_next_update) {
+ if (changed_states & group->rtpoll_states) {
+ update_triggers(group, now, PSI_POLL);
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ }
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
+ }
- psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1);
+ psi_schedule_rtpoll_work(group,
+ nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
+ force_reschedule);
out:
- mutex_unlock(&group->trigger_lock);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+}
+
+static int psi_rtpoll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+
+ sched_set_fifo_low(current);
+
+ while (true) {
+ wait_event_interruptible(group->rtpoll_wait,
+ atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_rtpoll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, rtpoll_timer);
+
+ atomic_set(&group->rtpoll_wakeup, 1);
+ wake_up_interruptible(&group->rtpoll_wait);
}
-static void record_times(struct psi_group_cpu *groupc, int cpu,
- bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
{
u32 delta;
- u64 now;
- now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;
@@ -643,109 +752,130 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
- else if (memstall_tick) {
- u32 sample;
- /*
- * Since we care about lost potential, a
- * memstall is FULL when there are no other
- * working tasks, but also when the CPU is
- * actively reclaiming and nothing productive
- * could run even if it were runnable.
- *
- * When the timer tick sees a reclaiming CPU,
- * regardless of runnable tasks, sample a FULL
- * tick (or less if it hasn't been a full tick
- * since the last state change).
- */
- sample = min(delta, (u32)jiffies_to_nsecs(1));
- groupc->times[PSI_MEM_FULL] += sample;
- }
}
- if (groupc->state_mask & (1 << PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
groupc->times[PSI_CPU_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_CPU_FULL))
+ groupc->times[PSI_CPU_FULL] += delta;
+ }
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set,
+ unsigned int clear, unsigned int set, u64 now,
bool wake_clock)
{
struct psi_group_cpu *groupc;
- u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
+ u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we assess the aggregate resource states this CPU's
- * tasks have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- *
- * Then we update the task counts according to the state
+ * First we update the task counts according to the state
* change requested through the @clear and @set bits.
+ *
+ * Then if the cgroup PSI stats accounting enabled, we
+ * assess the aggregate resource states this CPU's tasks
+ * have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, false);
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
- if (groupc->tasks[t] == 0 && !psi_bug) {
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
psi_bug = 1;
}
- groupc->tasks[t]--;
}
for (t = 0; set; set &= ~(1 << t), t++)
if (set & (1 << t))
groupc->tasks[t]++;
- /* Calculate state mask representing active states */
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ write_seqcount_end(&groupc->seq);
+ return;
+ }
+
for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s))
+ if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
+
+ /*
+ * Since we care about lost potential, a memstall is FULL
+ * when there are no other working tasks, but also when
+ * the CPU is actively reclaiming and nothing productive
+ * could run even if it were runnable. So when the current
+ * task in a cgroup is in_memstall, the corresponding groupc
+ * on that cpu is in PSI_MEM_FULL state.
+ */
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
+ state_mask |= (1 << PSI_MEM_FULL);
+
+ record_times(groupc, now);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+ if (state_mask & group->rtpoll_states)
+ psi_schedule_rtpoll_work(group, 1, false);
if (wake_clock && !delayed_work_pending(&group->avgs_work))
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
#ifdef CONFIG_CGROUPS
- struct cgroup *cgroup = NULL;
-
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else if (*iter == &psi_system)
- return NULL;
- else
- cgroup = cgroup_parent(*iter);
-
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
- }
-#else
- if (*iter)
- return NULL;
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
#endif
- *iter = &psi_system;
return &psi_system;
}
@@ -768,27 +898,19 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- /*
- * Periodic aggregation shuts off if there is a period of no
- * task changes, so we wake it back up if necessary. However,
- * don't do this if the task change is the aggregation worker
- * itself going to sleep, or we'll ping-pong forever.
- */
- if (unlikely((clear & TSK_RUNNING) &&
- (task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_avgs_work))
- wake_clock = false;
+ now = cpu_clock(cpu);
- while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, wake_clock);
+ group = task_psi_group(task);
+ do {
+ psi_group_change(group, cpu, clear, set, now, true);
+ } while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -796,59 +918,113 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- void *iter;
+ u64 now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When moving state between tasks, the group that
- * contains them both does not change: we can stop
- * updating the tree once we reach the first common
- * ancestor. Iterate @next's ancestors until we
- * encounter @prev's state.
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
*/
- iter = NULL;
- while ((group = iterate_groups(next, &iter))) {
- if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ group = task_psi_group(next);
+ do {
+ if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+ PSI_ONCPU) {
common = group;
break;
}
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- }
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ } while ((group = group->parent));
}
- /*
- * If this is a voluntary sleep, dequeue will have taken care
- * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
- * only need to deal with it during preemption.
- */
- if (sleep)
- return;
-
if (prev->pid) {
- psi_flags_change(prev, TSK_ONCPU, 0);
+ int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
+
+ /*
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
+ */
+ if (sleep) {
+ clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
+ if (prev->in_iowait)
+ set |= TSK_IOWAIT;
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
+ }
+
+ psi_flags_change(prev, clear, set);
+
+ group = task_psi_group(prev);
+ do {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ } while ((group = group->parent));
+
+ /*
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
+ */
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
+ clear &= ~TSK_ONCPU;
+ for (; group; group = group->parent)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
}
}
-void psi_memstall_tick(struct task_struct *task, int cpu)
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
{
+ int cpu = task_cpu(task);
struct psi_group *group;
- void *iter = NULL;
+ struct psi_group_cpu *groupc;
+ u64 now;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (!task->pid)
+ return;
- while ((group = iterate_groups(task, &iter))) {
- struct psi_group_cpu *groupc;
+ now = cpu_clock(cpu);
+
+ group = task_psi_group(task);
+ do {
+ if (!group->enabled)
+ continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
+
write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, true);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
write_seqcount_end(&groupc->seq);
- }
+
+ if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_rtpoll_work(group, 1, false);
+ } while ((group = group->parent));
}
+#endif
/**
* psi_memstall_enter - mark the beginning of a memory stall section
@@ -876,10 +1052,11 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_enter);
/**
* psi_memstall_leave - mark the end of an memory stall section
@@ -905,33 +1082,42 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
- cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
- if (!cgroup->psi.pcpu)
+ cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
+ if (!cgroup->psi)
+ return -ENOMEM;
+
+ cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi->pcpu) {
+ kfree(cgroup->psi);
return -ENOMEM;
- group_init(&cgroup->psi);
+ }
+ group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return;
- cancel_delayed_work_sync(&cgroup->psi.avgs_work);
- free_percpu(cgroup->psi.pcpu);
+ cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+ free_percpu(cgroup->psi->pcpu);
/* All triggers must be removed by now */
- WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
+ kfree(cgroup->psi);
}
/**
@@ -948,11 +1134,11 @@ void psi_cgroup_free(struct cgroup *cgroup)
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- unsigned int task_flags = 0;
+ unsigned int task_flags;
struct rq_flags rf;
struct rq *rq;
- if (static_branch_likely(&psi_disabled)) {
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -963,15 +1149,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task)) {
- task_flags = TSK_RUNNING;
- if (task_current(rq, task))
- task_flags |= TSK_ONCPU;
- } else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
-
- if (task->in_memstall)
- task_flags |= TSK_MEMSTALL;
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
if (task_flags)
psi_task_change(task, task_flags, 0);
@@ -984,10 +1186,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After we disable psi_group->enabled, we don't actually
+ * stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop test_state() loop, record_times()
+ * and averaging worker, see psi_group_change() for details.
+ *
+ * When disable cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and percpu psi_group_cpu
+ * would see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enable cgroup PSI, this function use psi_group_change()
+ * to get correct state mask from test_state() loop on tasks[],
+ * and restart groupc->state_start from now, use .clear = .set = 0
+ * here since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ u64 now;
+
+ rq_lock_irq(rq, &rf);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ rq_unlock_irq(rq, &rf);
+ }
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+ bool only_full = false;
int full;
u64 now;
@@ -1002,18 +1239,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2 - (res == PSI_CPU); full++) {
- unsigned long avg[3];
- u64 total;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
int w;
- for (w = 0; w < 3; w++)
- avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[PSI_AVGS][res * 2 + full],
- NSEC_PER_USEC);
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1023,47 +1267,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-static int psi_io_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_io_show, NULL);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_memory_show, NULL);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_cpu_show, NULL);
-}
-
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, size_t nbytes, enum psi_res res)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+ enum psi_res res, struct file *file,
+ struct kernfs_open_file *of)
{
struct psi_trigger *t;
enum psi_states state;
u32 threshold_us;
+ bool privileged;
u32 window_us;
if (static_branch_likely(&psi_disabled))
return ERR_PTR(-EOPNOTSUPP);
+ /*
+ * Checking the privilege here on file->f_cred implies that a privileged user
+ * could open the file and delegate the write to an unprivileged one.
+ */
+ privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
state = PSI_IO_SOME + res * 2;
else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1071,11 +1293,22 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
- if (window_us < WINDOW_MIN_US ||
- window_us > WINDOW_MAX_US)
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Unprivileged users can only use 2s windows so that averages aggregation
+ * work is used, and no RT threads need to be spawned.
+ */
+ if (!privileged && window_us % 2000000)
return ERR_PTR(-EINVAL);
/* Check threshold */
@@ -1090,123 +1323,137 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
- window_reset(&t->win, 0, 0, 0);
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;
- init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
-
- mutex_lock(&group->trigger_lock);
-
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
-
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
- kfree(t);
- mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ t->of = of;
+ if (!of)
+ init_waitqueue_head(&t->event_wait);
+ t->pending_event = false;
+ t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
+
+ if (privileged) {
+ mutex_lock(&group->rtpoll_trigger_lock);
+
+ if (!rcu_access_pointer(group->rtpoll_task)) {
+ struct task_struct *task;
+
+ task = kthread_create(psi_rtpoll_worker, group, "psimon");
+ if (IS_ERR(task)) {
+ kfree(t);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ return ERR_CAST(task);
+ }
+ atomic_set(&group->rtpoll_wakeup, 0);
+ wake_up_process(task);
+ rcu_assign_pointer(group->rtpoll_task, task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
- }
- list_add(&t->node, &group->triggers);
- group->poll_min_period = min(group->poll_min_period,
- div_u64(t->win.size, UPDATES_PER_WINDOW));
- group->nr_triggers[t->state]++;
- group->poll_states |= (1 << t->state);
+ list_add(&t->node, &group->rtpoll_triggers);
+ group->rtpoll_min_period = min(group->rtpoll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->rtpoll_nr_triggers[t->state]++;
+ group->rtpoll_states |= (1 << t->state);
- mutex_unlock(&group->trigger_lock);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ } else {
+ mutex_lock(&group->avgs_lock);
+ list_add(&t->node, &group->avg_triggers);
+ group->avg_nr_triggers[t->state]++;
+
+ mutex_unlock(&group->avgs_lock);
+ }
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct psi_group *group;
+ struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
-
- mutex_lock(&group->trigger_lock);
-
- if (!list_empty(&t->node)) {
- struct psi_trigger *tmp;
- u64 period = ULLONG_MAX;
-
- list_del(&t->node);
- group->nr_triggers[t->state]--;
- if (!group->nr_triggers[t->state])
- group->poll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
- if (group->poll_states == 0) {
- group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
- lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+
+ if (t->aggregator == PSI_AVGS) {
+ mutex_lock(&group->avgs_lock);
+ if (!list_empty(&t->node)) {
+ list_del(&t->node);
+ group->avg_nr_triggers[t->state]--;
+ }
+ mutex_unlock(&group->avgs_lock);
+ } else {
+ mutex_lock(&group->rtpoll_trigger_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->rtpoll_nr_triggers[t->state]--;
+ if (!group->rtpoll_nr_triggers[t->state])
+ group->rtpoll_states &= ~(1 << t->state);
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
+ /* Destroy rtpoll_task when the last trigger is destroyed */
+ if (group->rtpoll_states == 0) {
+ group->rtpoll_until = 0;
+ task_to_destroy = rcu_dereference_protected(
+ group->rtpoll_task,
+ lockdep_is_held(&group->rtpoll_trigger_lock));
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+ del_timer(&group->rtpoll_timer);
+ }
}
+ mutex_unlock(&group->rtpoll_trigger_lock);
}
- mutex_unlock(&group->trigger_lock);
-
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * rtpoll_task.
*/
synchronize_rcu();
/*
- * Destroy the kworker after releasing trigger_lock to prevent a
- * deadlock while waiting for psi_poll_work to acquire trigger_lock
+ * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+ * a deadlock while waiting for psi_rtpoll_work to acquire
+ * rtpoll_trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
- * But it might have been already scheduled before
- * that - deschedule it cleanly before destroying it.
+ * can no longer be found through group->rtpoll_task.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ kthread_stop(task_to_destroy);
+ atomic_set(&group->rtpoll_scheduled, 0);
}
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1216,27 +1463,52 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
- rcu_read_unlock();
-
- poll_wait(file, &t->event_wait, wait);
+ if (t->of)
+ kernfs_generic_poll(t->of, wait);
+ else
+ poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_cpu_show, NULL);
+}
+
static ssize_t psi_write(struct file *file, const char __user *user_buf,
size_t nbytes, enum psi_res res)
{
@@ -1257,14 +1529,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, res, file, NULL);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1299,7 +1581,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
@@ -1330,14 +1612,46 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_irq_show, NULL);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75f38..3261b067b67e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
-#include "sched.h"
-
-#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
@@ -16,6 +12,61 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
+/*
+ * period over which we measure -rt task CPU usage in us.
+ * default: 1s
+ */
+int sysctl_sched_rt_period = 1000000;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static struct ctl_table sched_rt_sysctls[] = {
+ {
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+ {
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = (void *)&sysctl_sched_rt_period,
+ },
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sysctl_sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
+ {}
+};
+
+static int __init sched_rt_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_rt_sysctls);
+ return 0;
+}
+late_initcall(sched_rt_sysctl_init);
+#endif
+
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
@@ -52,11 +103,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,6 +123,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -89,9 +145,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->highest_prio.next = MAX_RT_PRIO;
- rt_rq->rt_nr_migratory = 0;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -137,13 +192,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
-
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -161,7 +220,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;
@@ -250,6 +309,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -260,12 +321,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static void pull_rt_task(struct rq *this_rq);
-
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ return rq->online && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
@@ -302,60 +361,13 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static DEFINE_PER_CPU(struct callback_head, rt_push_head);
-static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);
@@ -382,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -393,8 +410,14 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio;
- } else
- rq->rt.highest_prio.next = MAX_RT_PRIO;
+ } else {
+ rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
+ }
}
#else
@@ -407,32 +430,13 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_rt_task(struct rq *this_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
@@ -461,13 +465,13 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
unsigned int cpu_cap;
/* Only heterogeneous systems can benefit from this check */
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return true;
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
- cpu_cap = capacity_orig_of(cpu);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
return cpu_cap >= min(min_cap, max_cap);
}
@@ -553,7 +557,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se) {
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
@@ -639,7 +643,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -699,7 +703,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
- * indicate its been disabled and disalow stealing.
+ * indicate its been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
@@ -795,7 +799,7 @@ static void __disable_runtime(struct rq *rq)
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
- BUG_ON(want);
+ WARN_ON_ONCE(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
@@ -873,6 +877,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;
/*
@@ -887,7 +892,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -904,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
@@ -925,7 +930,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -997,44 +1002,37 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -1045,7 +1043,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
- sub_nr_running(rq, rt_rq->rt_nr_running);
+ sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
}
@@ -1147,8 +1145,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap);
}
- } else
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ } else {
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ }
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
@@ -1229,7 +1228,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1242,7 +1240,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
@@ -1269,6 +1266,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
rt_se->on_list = 0;
}
+static inline struct sched_statistics *
+__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* schedstats is not supported for rt group. */
+ if (!rt_entity_is_task(rt_se))
+ return NULL;
+#endif
+
+ return &rt_task_of(rt_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
+}
+
+static inline void
+update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ if ((flags & DEQUEUE_SLEEP) && p) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+ }
+}
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
@@ -1324,24 +1427,29 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
+ unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
- dequeue_top_rt_rq(rt_rq_of_se(back));
+ rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
+
+ dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, flags);
@@ -1352,6 +1460,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
@@ -1374,6 +1484,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
+ check_schedstat_required();
+ update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
+
enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1428,14 +1541,14 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
struct task_struct *curr;
struct rq *rq;
bool test;
/* For anything but wake ups, just return the task_cpu */
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (!(flags & (WF_TTWU | WF_FORK)))
goto out;
rq = cpu_rq(cpu);
@@ -1547,7 +1660,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
resched_curr(rq);
@@ -1574,7 +1687,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_end_rt(rt_rq, rt_se);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -1593,8 +1711,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
rt_queue_push_tasks(rq);
}
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
@@ -1605,6 +1722,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1616,15 +1735,16 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
- BUG_ON(!rt_se);
+ rt_se = pick_next_rt_entity(rt_rq);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
return rt_task_of(rt_se);
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
@@ -1632,12 +1752,28 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = pick_task_rt(rq);
+
+ if (p)
+ set_next_task_rt(rq, p, true);
+
return p;
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_start_rt(rt_rq, rt_se);
+
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -1657,8 +1793,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
@@ -1705,7 +1841,7 @@ static int find_lowest_rq(struct task_struct *task)
* If we're on asym system ensure we consider the different capacities
* of the CPUs when searching for the lowest_mask.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
task, lowest_mask,
@@ -1752,8 +1888,8 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(lowest_mask,
- sched_domain_span(sd));
+ best_cpu = cpumask_any_and_distribute(lowest_mask,
+ sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
@@ -1770,7 +1906,7 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(lowest_mask);
+ cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
@@ -1809,11 +1945,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* the mean time, task could have
* migrated already or had its affinity changed.
* Also make sure that it wasn't scheduled on its rq.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
- task_running(rq, task) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
+ task_on_cpu(rq, task) ||
!rt_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
@@ -1859,7 +1999,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@@ -1873,9 +2013,6 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* It's possible that the next_task slipped in of
* higher priority than current. If that's the case
@@ -1886,6 +2023,51 @@ retry:
return 0;
}
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ /*
+ * Invoking find_lowest_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given we found a CPU with lower priority than @next_task,
+ * therefore it should be running. However we cannot migrate it
+ * to this other CPU, instead attempt to push the current
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(rq);
+ }
+
+ return 0;
+ }
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -1927,12 +2109,10 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
- ret = 1;
-
resched_curr(lowest_rq);
+ ret = 1;
double_unlock_balance(rq, lowest_rq);
-
out:
put_task_struct(next_task);
@@ -1942,7 +2122,7 @@ out:
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
;
}
@@ -1970,7 +2150,7 @@ static void push_rt_tasks(struct rq *rq)
*
* Each root domain has its own irq work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
+ * task must be checked if there's one or many CPUs that are lowering
* their priority, there's a single irq work iterator that will try to
* push off RT tasks that are waiting to run.
*
@@ -2094,9 +2274,10 @@ void rto_push_irq_work_func(struct irq_work *work)
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_lock(rq);
+ while (push_rt_task(rq, true))
+ ;
+ raw_spin_rq_unlock(rq);
}
raw_spin_lock(&rd->rto_lock);
@@ -2120,7 +2301,7 @@ static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);
@@ -2167,6 +2348,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -2186,7 +2368,7 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
* than what's currently running on its CPU.
- * This is just that p is wakeing up and hasn't
+ * This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue
@@ -2194,11 +2376,14 @@ static void pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ resched = true;
+ }
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2208,6 +2393,15 @@ static void pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
}
if (resched)
@@ -2220,7 +2414,7 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- bool need_to_push = !task_running(rq, p) &&
+ bool need_to_push = !task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
@@ -2291,13 +2485,20 @@ void __init init_sched_rt_class(void)
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
/*
- * If we are already running, then there's nothing
- * that needs to be done. But if we are not running
- * we may need to preempt the current running task.
- * If that current running task is also an RT task
+ * If we are running, update the avg_rt tracking, as the running time
+ * will now on be accounted into the latter.
+ */
+ if (task_current(rq, p)) {
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ return;
+ }
+
+ /*
+ * If we are not running we may need to preempt the current
+ * running task. If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
@@ -2317,7 +2518,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
@@ -2429,13 +2630,28 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq = task_group(p)->rt_rq[cpu];
+#else
+ rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+ return rt_rq_throttled(rt_rq);
+}
+#endif
+
+DEFINE_SCHED_CLASS(rt) = {
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
@@ -2443,12 +2659,14 @@ const struct sched_class rt_sched_class = {
#ifdef CONFIG_SMP
.balance = balance_rt,
+ .pick_task = pick_task_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+ .find_lock_rq = find_lock_lowest_rq,
#endif
.task_tick = task_tick_rt,
@@ -2460,6 +2678,10 @@ const struct sched_class rt_sched_class = {
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_rt,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
@@ -2664,6 +2886,7 @@ long sched_group_rt_period(struct task_group *tg)
return rt_period_us;
}
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
int ret = 0;
@@ -2674,6 +2897,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
+#endif /* CONFIG_SYSCTL */
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
@@ -2685,6 +2909,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
}
#else /* !CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
unsigned long flags;
@@ -2702,13 +2928,12 @@ static int sched_rt_global_constraints(void)
return 0;
}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -2720,11 +2945,15 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
-int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
@@ -2735,7 +2964,7 @@ int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
@@ -2763,7 +2992,7 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2779,11 +3008,15 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
return ret;
}
+#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..001fe047bd5d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,83 +2,90 @@
/*
* Scheduler internal types and methods:
*/
-#include <linux/sched.h>
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
-#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/idle.h>
-#include <linux/sched/init.h>
-#include <linux/sched/isolation.h>
-#include <linux/sched/jobctl.h>
+#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
-#include <linux/sched/nohz.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/prio.h>
-#include <linux/sched/rt.h>
+#include <linux/sched/rseq_api.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
-#include <linux/sched/user.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/xacct.h>
-
-#include <uapi/linux/sched/types.h>
-#include <linux/binfmts.h>
-#include <linux/blkdev.h>
-#include <linux/compat.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/cgroup_api.h>
+#include <linux/cgroup.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuset.h>
+#include <linux/cpumask_api.h>
#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/delayacct.h>
-#include <linux/energy_model.h>
-#include <linux/init_task.h>
-#include <linux/kprobes.h>
+#include <linux/file.h>
+#include <linux/fs_api.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
+#include <linux/irq_work.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
#include <linux/kthread.h>
-#include <linux/membarrier.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/nmi.h>
+#include <linux/ktime_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/plist.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
+#include <linux/static_key.h>
#include <linux/stop_machine.h>
-#include <linux/suspend.h>
-#include <linux/swait.h>
+#include <linux/syscalls_api.h>
#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/tsacct_kern.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue_api.h>
+
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
-#include <asm/tlb.h>
+#include "../workqueue_internal.h"
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
#endif
#include "cpupri.h"
#include "cpudeadline.h"
#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
struct rq;
@@ -96,6 +103,12 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+extern int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+extern int sched_rr_timeslice;
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -137,7 +150,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
* scale_load() and scale_load_down(w) to convert between them. The
* following must be true:
*
- * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
*
*/
#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
@@ -202,6 +215,13 @@ static inline void update_avg(u64 *avg, u64 sample)
}
/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
+/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
* This is actually gross. :(
@@ -215,7 +235,9 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -227,8 +249,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
@@ -251,13 +273,17 @@ struct rt_bandwidth {
unsigned int rt_period_active;
};
-void __dl_clear_params(struct task_struct *p);
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
/*
- * To keep the bandwidth of -deadline tasks and groups under control
+ * To keep the bandwidth of -deadline tasks under control
* we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
@@ -265,58 +291,17 @@ void __dl_clear_params(struct task_struct *p);
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * With respect to SMP, bandwidth is given on a per root domain basis,
* meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
*/
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
-static inline int dl_bandwidth_enabled(void)
-{
- return sysctl_sched_rt_runtime >= 0;
-}
-
struct dl_bw {
raw_spinlock_t lock;
u64 bw;
u64 total_bw;
};
-static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
-
-static inline
-void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw -= tsk_bw;
- __dl_update(dl_b, (s32)tsk_bw / cpus);
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw += tsk_bw;
- __dl_update(dl_b, -((s32)tsk_bw / cpus));
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
@@ -325,14 +310,37 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int dl_bw_check_overflow(int cpu);
-#ifdef CONFIG_CGROUP_SCHED
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ * dl_se::rq -- runqueue we belong to.
+ *
+ * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ * server when it runs out of tasks to run.
+ *
+ * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ * returns NULL.
+ *
+ * dl_server_update() -- called from update_curr_common(), propagates runtime
+ * to the server.
+ *
+ * dl_server_start()
+ * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ *
+ * dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick);
-#include <linux/cgroup.h>
-#include <linux/psi.h>
+#ifdef CONFIG_CGROUP_SCHED
struct cfs_rq;
struct rt_rq;
@@ -345,6 +353,8 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota;
u64 runtime;
+ u64 burst;
+ u64 runtime_snap;
s64 hierarchical_quota;
u8 idle;
@@ -357,7 +367,9 @@ struct cfs_bandwidth {
/* Statistics: */
int nr_periods;
int nr_throttled;
+ int nr_burst;
u64 throttled_time;
+ u64 burst_time;
#endif
};
@@ -372,6 +384,9 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -446,21 +461,31 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_FAIR_GROUP_SCHED
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
+#else
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif
+
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
-extern void free_rt_sched_group(struct task_group *tg);
-extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
@@ -474,13 +499,15 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -493,18 +520,71 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
+extern void unregister_rt_sched_group(struct task_group *tg);
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+
+/*
+ * u64_u32_load/u64_u32_store
+ *
+ * Use a copy of a u64 value to protect against data race. This is only
+ * applicable for 32-bits architectures.
+ */
+#ifdef CONFIG_64BIT
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
+#else
+# define u64_u32_load_copy(var, copy) \
+({ \
+ u64 __val, __val_copy; \
+ do { \
+ __val_copy = copy; \
+ /* \
+ * paired with u64_u32_store_copy(), ordering access \
+ * to var and copy. \
+ */ \
+ smp_rmb(); \
+ __val = var; \
+ } while (__val != __val_copy); \
+ __val; \
+})
+# define u64_u32_store_copy(var, copy, val) \
+do { \
+ typeof(val) __val = (val); \
+ var = __val; \
+ /* \
+ * paired with u64_u32_load_copy(), ordering access to var and \
+ * copy. \
+ */ \
+ smp_wmb(); \
+ copy = __val; \
+} while (0)
+#endif
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned int nr_running;
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
+#endif
+
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -517,8 +597,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
- struct sched_entity *last;
- struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
@@ -530,7 +608,7 @@ struct cfs_rq {
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
- u64 load_last_update_time_copy;
+ u64 last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
@@ -541,6 +619,7 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ u64 last_update_tg_load_avg;
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
@@ -572,16 +651,26 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
+ u64 throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+ u64 throttled_pelt_idle_copy;
+#endif
u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
+ struct list_head throttled_csd_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -610,8 +699,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
@@ -625,7 +712,7 @@ struct rt_rq {
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ unsigned int rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
@@ -642,7 +729,7 @@ struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned int dl_nr_running;
#ifdef CONFIG_SMP
/*
@@ -656,7 +743,6 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
int overloaded;
/*
@@ -688,6 +774,12 @@ struct dl_rq {
u64 extra_bw;
/*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
+ */
+ u64 max_bw;
+
+ /*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
@@ -782,6 +874,15 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+ /*
+ * Indicate whether a root_domain's dl_bw has been checked or
+ * updated. It's monotonously increasing value.
+ *
+ * Also, some corner cases, like 'wrap around' is dangerous, but given
+ * that u64 is 'big enough'. So that shouldn't be a concern.
+ */
+ u64 visit_gen;
+
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
@@ -862,8 +963,16 @@ struct uclamp_rq {
unsigned int value;
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
+struct rq;
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
+};
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -873,12 +982,8 @@ struct uclamp_rq {
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -923,7 +1028,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned int nr_uninterruptible;
struct task_struct __rcu *curr;
struct task_struct *idle;
@@ -937,9 +1042,20 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
+ u64 clock_pelt_idle;
+ u64 clock_idle;
+#ifndef CONFIG_64BIT
+ u64 clock_pelt_idle_copy;
+ u64 clock_idle_copy;
+#endif
atomic_t nr_iowait;
+#ifdef CONFIG_SCHED_DEBUG
+ u64 last_seen_need_resched_ns;
+ int ticks_without_resched;
+#endif
+
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
@@ -949,9 +1065,8 @@ struct rq {
struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
- struct callback_head *balance_callback;
+ struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -982,6 +1097,10 @@ struct rq {
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ struct rcuwait hotplug_wait;
+#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1003,6 +1122,7 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -1027,6 +1147,38 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#ifdef CONFIG_SMP
+ unsigned int nr_pinned;
+#endif
+ unsigned int push_busy;
+ struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ struct task_struct *core_pick;
+ unsigned int core_enabled;
+ unsigned int core_sched_seq;
+ struct rb_root core_tree;
+
+ /* shared state -- careful with sched_core_cpu_deactivate() */
+ unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
+ unsigned int core_forceidle_count;
+ unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
+#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1054,6 +1206,215 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+#define MDF_PUSH 0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->migration_disabled;
+#else
+ return false;
+#endif
+}
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() this_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
+struct sched_group;
+#ifdef CONFIG_SCHED_CORE
+static inline struct cpumask *sched_group_span(struct sched_group *sg);
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+/*
+ * Be careful with this function; not for general use. The return value isn't
+ * stable unless you actually hold a relevant rq->__lock.
+ */
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ if (rq->core_enabled)
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool fi);
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+/*
+ * Helpers to check if the CPU's core cookie matches with the task's cookie
+ * when core scheduling is enabled.
+ * A special case is that the task's cookie always matches with CPU's core
+ * cookie if the CPU is in an idle core.
+ */
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ return rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ bool idle_core = true;
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
+ if (!available_idle_cpu(cpu)) {
+ idle_core = false;
+ break;
+ }
+ }
+
+ /*
+ * A CPU in an idle core is always the best choice for tasks with
+ * cookies.
+ */
+ return idle_core || rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
+ if (sched_core_cookie_match(cpu_rq(cpu), p))
+ return true;
+ }
+ return false;
+}
+
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+ return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ return true;
+}
+#endif /* CONFIG_SCHED_CORE */
+
+static inline void lockdep_assert_rq_held(struct rq *rq)
+{
+ lockdep_assert_held(__rq_lockp(rq));
+}
+
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
+extern bool raw_spin_rq_trylock(struct rq *rq);
+extern void raw_spin_rq_unlock(struct rq *rq);
+
+static inline void raw_spin_rq_lock(struct rq *rq)
+{
+ raw_spin_rq_lock_nested(rq, 0);
+}
+
+static inline void raw_spin_rq_lock_irq(struct rq *rq)
+{
+ local_irq_disable();
+ raw_spin_rq_lock(rq);
+}
+
+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_enable();
+}
+
+static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ raw_spin_rq_lock(rq);
+ return flags;
+}
+
+static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_restore(flags);
+}
+
+#define raw_spin_rq_lock_irqsave(rq, flags) \
+do { \
+ flags = _raw_spin_rq_lock_irqsave(rq); \
+} while (0)
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1068,21 +1429,56 @@ static inline void update_idle_core(struct rq *rq)
static inline void update_idle_core(struct rq *rq) { }
#endif
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() this_cpu_ptr(&runqueues)
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
-extern void update_rq_clock(struct rq *rq);
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
-static inline u64 __rq_clock_broken(struct rq *rq)
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
- return READ_ONCE(rq->clock);
+ return grp->my_q;
}
+#else
+
+#define task_of(_se) container_of(_se, struct task_struct, se)
+
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
+{
+ const struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+#endif
+
+extern void update_rq_clock(struct rq *rq);
+
/*
* rq::clock_update_flags bits
*
@@ -1102,7 +1498,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
*
* if (rq-clock_update_flags >= RQCF_UPDATED)
*
- * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
@@ -1121,7 +1517,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock;
@@ -1129,7 +1525,7 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_task;
@@ -1155,20 +1551,42 @@ static inline u64 rq_clock_thermal(struct rq *rq)
static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags |= RQCF_REQ_SKIP;
}
/*
* See rt task throttling, which is the only time a skip
- * request is cancelled.
+ * request is canceled.
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple update.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@@ -1182,13 +1600,28 @@ struct rq_flags {
#endif
};
+extern struct balance_callback balance_push_callback;
+
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
+#ifdef CONFIG_SMP
+ SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+#endif
#endif
}
@@ -1199,12 +1632,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
rf->clock_update_flags = RQCF_UPDATED;
#endif
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1225,7 +1658,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline void
@@ -1234,15 +1667,20 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+ _T->rq = task_rq_lock(_T->lock, &_T->rf),
+ task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
@@ -1250,7 +1688,7 @@ static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
@@ -1258,24 +1696,16 @@ static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
static inline void
-rq_relock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
-}
-
-static inline void
rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
static inline void
@@ -1283,7 +1713,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
static inline void
@@ -1291,9 +1721,24 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
static inline struct rq *
this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
@@ -1315,12 +1760,14 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-extern void sched_init_numa(void);
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
-static inline void sched_init_numa(void) { }
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
@@ -1353,15 +1800,20 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static inline void
queue_balance_callback(struct rq *rq,
- struct callback_head *head,
+ struct balance_callback *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- if (unlikely(head->next))
+ /*
+ * Don't (re)queue an already queued item; nor queue anything when
+ * balance_push() is active, see the comment with
+ * balance_push_callback.
+ */
+ if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
- head->func = (void (*)(struct callback_head *))func;
+ head->func = func;
head->next = rq->balance_callback;
rq->balance_callback = head;
}
@@ -1381,6 +1833,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1388,16 +1847,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -1418,11 +1886,18 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
+
+static __always_inline bool sched_asym_cpucap_active(void)
+{
+ return static_branch_unlikely(&sched_asym_cpucapacity);
+}
struct sched_group_capacity {
atomic_t ref;
@@ -1440,7 +1915,7 @@ struct sched_group_capacity {
int id;
#endif
- unsigned long cpumask[0]; /* Balance mask */
+ unsigned long cpumask[]; /* Balance mask */
};
struct sched_group {
@@ -1448,8 +1923,10 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
+ int flags;
/*
* The CPUs this group covers.
@@ -1474,41 +1951,57 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
return to_cpumask(sg->sgc->cpumask);
}
-/**
- * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
- * @group: The group whose first CPU is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_span(group));
-}
-
extern int group_balance_cpu(struct sched_group *sg);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-void register_sched_domain_sysctl(void);
+#ifdef CONFIG_SCHED_DEBUG
+void update_sched_domain_debugfs(void);
void dirty_sched_domain_sysctl(int cpu);
-void unregister_sched_domain_sysctl(void);
#else
-static inline void register_sched_domain_sysctl(void)
+static inline void update_sched_domain_debugfs(void)
{
}
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
-static inline void unregister_sched_domain_sysctl(void)
-{
-}
#endif
-extern void flush_smp_call_function_from_idle(void);
+extern int sched_update_scaling(void);
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+static inline const struct cpumask *task_user_cpus(struct task_struct *p)
+{
+ if (!p->user_cpus_ptr)
+ return cpu_possible_mask; /* &init_task.cpus_mask */
+ return p->user_cpus_ptr;
+}
+#endif /* CONFIG_SMP */
#include "stats.h"
-#include "autogroup.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
#ifdef CONFIG_CGROUP_SCHED
@@ -1541,6 +2034,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
+ p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1569,11 +2063,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- WRITE_ONCE(p->cpu, cpu);
-#else
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
-#endif
p->wake_cpu = cpu;
#endif
}
@@ -1582,7 +2072,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -1598,7 +2087,7 @@ enum {
#undef SCHED_FEAT
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* To support run-time toggling of sched features, all the translation units
@@ -1606,6 +2095,7 @@ enum {
*/
extern const_debug unsigned int sysctl_sched_features;
+#ifdef CONFIG_JUMP_LABEL
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -1618,7 +2108,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
+#else /* !CONFIG_JUMP_LABEL */
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* CONFIG_JUMP_LABEL */
+
+#else /* !SCHED_DEBUG */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1634,7 +2130,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
+#endif /* SCHED_DEBUG */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -1657,7 +2153,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-static inline int task_running(struct rq *rq, struct task_struct *p)
+static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->on_cpu;
@@ -1676,13 +2172,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
-/*
- * wake flags
- */
-#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* Child wakeup after fork */
-#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
+/* Wake flags. The first three directly map to some SD flag value */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+
+#ifdef CONFIG_SMP
+static_assert(WF_EXEC == SD_BALANCE_EXEC);
+static_assert(WF_FORK == SD_BALANCE_FORK);
+static_assert(WF_TTWU == SD_BALANCE_WAKE);
+#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1712,6 +2215,10 @@ extern const u32 sched_prio_to_wmult[40];
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -1722,6 +2229,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -1735,11 +2243,20 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_INITIAL 0x80
+#define ENQUEUE_MIGRATING 0x100
#define RETRY_TASK ((void *)-1UL)
+struct affinity_context {
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
+};
+
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
- const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
@@ -1748,9 +2265,9 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
struct task_struct *(*pick_next_task)(struct rq *rq);
@@ -1759,16 +2276,20 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
- void (*set_cpus_allowed)(struct task_struct *p,
- const struct cpumask *newmask);
+ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+
+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1777,7 +2298,7 @@ struct sched_class {
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serliazed by
+ * cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
@@ -1790,11 +2311,12 @@ struct sched_class {
void (*update_curr)(struct rq *rq);
-#define TASK_SET_GROUP 0
-#define TASK_MOVE_GROUP 1
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_change_group)(struct task_struct *p, int type);
+ void (*task_change_group)(struct task_struct *p);
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+ int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
@@ -1806,21 +2328,36 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next, false);
}
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+
+/*
+ * Helper to define a sched_class instance; each one is placed in a separate
+ * section which is ordered by the linker script:
+ *
+ * include/asm-generic/vmlinux.lds.h
+ *
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
+ * Also enforce alignment on the instance, not the type, to guarantee layout.
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+ __aligned(__alignof__(struct sched_class)) \
+ __section("__" #name "_sched_class")
+
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class = class->next)
+ for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, NULL)
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1851,13 +2388,39 @@ static inline bool sched_fair_runnable(struct rq *rq)
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
+#define SCA_CHECK 0x01
+#define SCA_MIGRATE_DISABLE 0x02
+#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
+
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ lockdep_assert_rq_held(rq);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ if (p->migration_disabled)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
#endif
@@ -1887,6 +2450,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
#endif
extern void schedule_idle(void);
+asmlinkage void schedule_user(void);
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
@@ -1903,11 +2467,9 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern struct dl_bandwidth def_dl_bandwidth;
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
@@ -1930,12 +2492,7 @@ extern int __init sched_tick_offload_init(void);
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
- int cpu;
-
- if (!tick_nohz_full_enabled())
- return;
-
- cpu = cpu_of(rq);
+ int cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
@@ -1955,6 +2512,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1969,6 +2529,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, -count);
+ }
+
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
@@ -1976,11 +2540,32 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
+
+#ifdef CONFIG_PREEMPT_RT
+#define SCHED_NR_MIGRATE_BREAK 8
+#else
+#define SCHED_NR_MIGRATE_BREAK 32
+#endif
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
+
+#ifdef CONFIG_SCHED_DEBUG
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1990,17 +2575,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
*/
static inline int hrtick_enabled(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ if (!sched_feat(HRTICK_DL))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
void hrtick_start(struct rq *rq, u64 delay);
#else
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ return 0;
+}
+
static inline int hrtick_enabled(struct rq *rq)
{
return 0;
@@ -2016,6 +2623,16 @@ void arch_scale_freq_tick(void)
#endif
#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
@@ -2023,10 +2640,62 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
+
+#ifdef CONFIG_SMP
+
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
+{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif
+ return rq1->cpu < rq2->cpu;
+}
+
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2041,7 +2710,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
return 1;
@@ -2060,20 +2729,22 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
+ }
+
+ if (rq_order_less(this_rq, busiest)) {
+ raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
}
- return ret;
+
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
}
#endif /* CONFIG_PREEMPTION */
@@ -2083,11 +2754,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
@@ -2095,8 +2762,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (__rq_lockp(this_rq) != __rq_lockp(busiest))
+ raw_spin_rq_unlock(busiest);
+ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2126,31 +2794,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
}
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -2161,11 +2814,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}
extern void set_rq_online (struct rq *rq);
@@ -2184,10 +2837,11 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
- BUG_ON(!irqs_disabled());
- BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE(rq1 != rq2);
+ raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ double_rq_clock_clear_update(rq1, rq2);
}
/*
@@ -2200,18 +2854,23 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ WARN_ON_ONCE(rq1 != rq2);
+ raw_spin_rq_unlock(rq1);
__release(rq2->lock);
}
#endif
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
-extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
@@ -2219,6 +2878,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2226,6 +2887,8 @@ extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
#endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
@@ -2238,11 +2901,19 @@ extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
+#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3
+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
+#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
@@ -2251,33 +2922,12 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif
-
-#ifdef CONFIG_SMP
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
- int i;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask) {
- struct rq *rq = cpu_rq(i);
-
- rq->dl.extra_bw += bw;
- }
-}
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
#else
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
+static inline void nohz_run_idle_balance(int cpu) { }
#endif
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
@@ -2290,7 +2940,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
* and never move forward.
*/
static inline u64 irq_time_read(int cpu)
@@ -2346,40 +2996,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
-#ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
-
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
-
- if (p) {
- min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
- max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
- }
-
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
-
- return clamp(util, min_util, max_util);
-}
-#else /* CONFIG_UCLAMP_TASK */
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- return util;
-}
-#endif /* CONFIG_UCLAMP_TASK */
-
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -2389,32 +3005,29 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
#endif
#ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-#endif
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max);
-/**
- * enum schedutil_type - CPU utilization type
- * @FREQUENCY_UTIL: Utilization used to select frequency
- * @ENERGY_UTIL: Utilization used during energy calculation
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
+
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
*
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
+ * The function will return true if the original capacity of @cpu is
+ * greater than or equal to task's deadline density right shifted by
+ * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise.
*/
-enum schedutil_type {
- FREQUENCY_UTIL,
- ENERGY_UTIL,
-};
-
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p);
+ return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT);
+}
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
@@ -2426,30 +3039,99 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
{
- unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+ return READ_ONCE(rq->avg_rt.util_avg);
+}
+#endif
- if (sched_feat(UTIL_EST)) {
- util = max_t(unsigned long, util,
- READ_ONCE(rq->cfs.avg.util_est.enqueued));
- }
+#ifdef CONFIG_UCLAMP_TASK
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
- return util;
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ return READ_ONCE(rq->uclamp[clamp_id].value);
}
-static inline unsigned long cpu_util_rt(struct rq *rq)
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
{
- return READ_ONCE(rq->avg_rt.util_avg);
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
}
-#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
{
- return 0;
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
}
-#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+ unsigned long rq_util;
+ unsigned long max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return false;
+
+ rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
+}
+#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
@@ -2490,6 +3172,8 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
+extern struct cpufreq_governor schedutil_gov;
+
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
@@ -2540,5 +3224,253 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
#endif
-void swake_up_all_locked(struct swait_queue_head *q);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+
+#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
+#define MM_CID_SCAN_DELAY 100 /* 100ms */
+
+extern raw_spinlock_t cid_lock;
+extern int use_cid_lock;
+
+extern void sched_mm_cid_migrate_from(struct task_struct *t);
+extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
+extern void init_sched_mm_cid(struct task_struct *t);
+
+static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+{
+ if (cid < 0)
+ return;
+ cpumask_clear_cpu(cid, mm_cidmask(mm));
+}
+
+/*
+ * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
+ * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
+ * be held to transition to other states.
+ *
+ * State transitions synchronized with cmpxchg or try_cmpxchg need to be
+ * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ */
+static inline void mm_cid_put_lazy(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (!mm_cid_is_lazy_put(cid) ||
+ !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid, res;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ for (;;) {
+ if (mm_cid_is_unset(cid))
+ return MM_CID_UNSET;
+ /*
+ * Attempt transition from valid or lazy-put to unset.
+ */
+ res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
+ if (res == cid)
+ break;
+ cid = res;
+ }
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm)
+{
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = mm_cid_pcpu_unset(mm);
+ if (cid == MM_CID_UNSET)
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int __mm_cid_try_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ /*
+ * Retry finding first zero bit if the mask is temporarily
+ * filled. This only happens during concurrent remote-clear
+ * which owns a cid without holding a rq lock.
+ */
+ for (;;) {
+ cid = cpumask_first_zero(cpumask);
+ if (cid < nr_cpu_ids)
+ break;
+ cpu_relax();
+ }
+ if (cpumask_test_and_set_cpu(cid, cpumask))
+ return -1;
+ return cid;
+}
+
+/*
+ * Save a snapshot of the current runqueue time of this cpu
+ * with the per-cpu cid value, allowing to estimate how recently it was used.
+ */
+static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+ WRITE_ONCE(pcpu_cid->time, rq->clock);
+}
+
+static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+{
+ int cid;
+
+ /*
+ * All allocations (even those using the cid_lock) are lock-free. If
+ * use_cid_lock is set, hold the cid_lock to perform cid allocation to
+ * guarantee forward progress.
+ */
+ if (!READ_ONCE(use_cid_lock)) {
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto end;
+ raw_spin_lock(&cid_lock);
+ } else {
+ raw_spin_lock(&cid_lock);
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto unlock;
+ }
+
+ /*
+ * cid concurrently allocated. Retry while forcing following
+ * allocations to use the cid_lock to ensure forward progress.
+ */
+ WRITE_ONCE(use_cid_lock, 1);
+ /*
+ * Set use_cid_lock before allocation. Only care about program order
+ * because this is only required for forward progress.
+ */
+ barrier();
+ /*
+ * Retry until it succeeds. It is guaranteed to eventually succeed once
+ * all newcoming allocations observe the use_cid_lock flag set.
+ */
+ do {
+ cid = __mm_cid_try_get(mm);
+ cpu_relax();
+ } while (cid < 0);
+ /*
+ * Allocate before clearing use_cid_lock. Only care about
+ * program order because this is for forward progress.
+ */
+ barrier();
+ WRITE_ONCE(use_cid_lock, 0);
+unlock:
+ raw_spin_unlock(&cid_lock);
+end:
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+}
+
+static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ struct cpumask *cpumask;
+ int cid;
+
+ lockdep_assert_rq_held(rq);
+ cpumask = mm_cidmask(mm);
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (mm_cid_is_valid(cid)) {
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+ }
+ if (mm_cid_is_lazy_put(cid)) {
+ if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ }
+ cid = __mm_cid_get(rq, mm);
+ __this_cpu_write(pcpu_cid->cid, cid);
+ return cid;
+}
+
+static inline void switch_mm_cid(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ /*
+ * Provide a memory barrier between rq->curr store and load of
+ * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
+ *
+ * Should be adapted if context_switch() is modified.
+ */
+ if (!next->mm) { // to kernel
+ /*
+ * user -> kernel transition does not guarantee a barrier, but
+ * we can use the fact that it performs an atomic operation in
+ * mmgrab().
+ */
+ if (prev->mm) // from user
+ smp_mb__after_mmgrab();
+ /*
+ * kernel -> kernel transition does not change rq->curr->mm
+ * state. It stays NULL.
+ */
+ } else { // to user
+ /*
+ * kernel -> user transition does not provide a barrier
+ * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
+ * Provide it here.
+ */
+ if (!prev->mm) // from kernel
+ smp_mb();
+ /*
+ * user -> user transition guarantees a memory barrier through
+ * switch_mm() when current->mm changes. If current->mm is
+ * unchanged, no barrier is needed.
+ */
+ }
+ if (prev->mm_cid_active) {
+ mm_cid_snapshot_time(rq, prev->mm);
+ mm_cid_put_lazy(prev);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
+static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+static inline void init_sched_mm_cid(struct task_struct *t) { }
+#endif
+
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..21ac44428bb0 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -6,4 +6,10 @@
extern void sched_ttwu_pending(void *arg);
-extern void send_call_function_single_ipi(int cpu);
+extern bool call_function_single_prep_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 750fb3c67eed..857f837f52cb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,7 +2,110 @@
/*
* /proc/schedstat implementation
*/
-#include "sched.h"
+
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 wait_start, prev_wait_start;
+
+ wait_start = rq_clock(rq);
+ prev_wait_start = schedstat_val(stats->wait_start);
+
+ if (p && likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
+
+ __schedstat_set(stats->wait_start, wait_start);
+}
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start);
+
+ if (p) {
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ __schedstat_set(stats->wait_start, delta);
+
+ return;
+ }
+
+ trace_sched_stat_wait(p, delta);
+ }
+
+ __schedstat_set(stats->wait_max,
+ max(schedstat_val(stats->wait_max), delta));
+ __schedstat_inc(stats->wait_count);
+ __schedstat_add(stats->wait_sum, delta);
+ __schedstat_set(stats->wait_start, 0);
+}
+
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 sleep_start, block_start;
+
+ sleep_start = schedstat_val(stats->sleep_start);
+ block_start = schedstat_val(stats->block_start);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->sleep_max)))
+ __schedstat_set(stats->sleep_max, delta);
+
+ __schedstat_set(stats->sleep_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+
+ if (p) {
+ account_scheduler_latency(p, delta >> 10, 1);
+ trace_sched_stat_sleep(p, delta);
+ }
+ }
+
+ if (block_start) {
+ u64 delta = rq_clock(rq) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->block_max)))
+ __schedstat_set(stats->block_max, delta);
+
+ __schedstat_set(stats->block_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+ __schedstat_add(stats->sum_block_runtime, delta);
+
+ if (p) {
+ if (p->in_iowait) {
+ __schedstat_add(stats->iowait_sum, delta);
+ __schedstat_inc(stats->iowait_count);
+ trace_sched_stat_iowait(p, delta);
+ }
+
+ trace_sched_stat_blocked(p, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(p),
+ delta >> 20);
+ }
+ account_scheduler_latency(p, delta >> 10, 0);
+ }
+ }
+}
/*
* Current schedstat API version.
@@ -74,7 +177,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some CPUs, including cpu 0, may be missing so we have
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..38f3698f5e5b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,7 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
+extern struct static_key_false sched_schedstats;
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
@@ -25,7 +29,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
@@ -40,9 +44,33 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+static inline void
+check_schedstat_required(void)
+{
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled())
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
+}
+
#else /* !CONFIG_SCHEDSTATS: */
+
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
-static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
@@ -53,9 +81,37 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(var) 0
# define schedstat_val_or_zero(var) 0
+
+# define __update_stats_wait_start(rq, p, stats) do { } while (0)
+# define __update_stats_wait_end(rq, p, stats) do { } while (0)
+# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0)
+# define check_schedstat_required() do { } while (0)
+
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+struct sched_entity_stats {
+ struct sched_entity se;
+ struct sched_statistics stats;
+} __no_randomize_layout;
+#endif
+
+static inline struct sched_statistics *
+__schedstats_from_se(struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!entity_is_task(se))
+ return &container_of(se, struct sched_entity_stats, se)->stats;
+#endif
+ return &task_of(se)->stats;
+}
+
#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -69,11 +125,12 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;
- if (!wakeup || p->sched_psi_wake_requeue) {
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
+
+ if (!wakeup) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
- if (p->sched_psi_wake_requeue)
- p->sched_psi_wake_requeue = 0;
} else {
if (p->in_iowait)
clear |= TSK_IOWAIT;
@@ -84,28 +141,19 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
- int clear = TSK_RUNNING, set = 0;
-
if (static_branch_likely(&psi_disabled))
return;
- if (!sleep) {
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
- } else {
- /*
- * When a task sleeps, schedule() dequeues it before
- * switching to the next one. Merge the clearing of
- * TSK_RUNNING and TSK_ONCPU to save an unnecessary
- * psi_task_change() call in psi_sched_switch().
- */
- clear |= TSK_ONCPU;
-
- if (p->in_iowait)
- set |= TSK_IOWAIT;
- }
+ /*
+ * A voluntary sleep is a dequeue followed by a task switch. To
+ * avoid walking all ancestors twice, psi_task_switch() handles
+ * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+ * Do nothing here.
+ */
+ if (sleep)
+ return;
- psi_task_change(p, clear, set);
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -117,19 +165,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || p->in_memstall)) {
+ if (unlikely(p->psi_flags)) {
struct rq_flags rf;
struct rq *rq;
- int clear = 0;
-
- if (p->in_iowait)
- clear |= TSK_IOWAIT;
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
- psi_task_change(p, clear, 0);
- p->sched_psi_wake_requeue = 1;
+ psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, &rf);
}
}
@@ -144,14 +185,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-static inline void psi_task_tick(struct rq *rq)
-{
- if (static_branch_likely(&psi_disabled))
- return;
-
- if (unlikely(rq->curr->in_memstall))
- psi_memstall_tick(rq->curr, cpu_of(rq));
-}
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,33 +192,28 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
- t->sched_info.last_queued = 0;
-}
-
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long delta = 0;
- if (sched_info_on()) {
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- }
- sched_info_reset_dequeued(t);
+ if (!t->sched_info.last_queued)
+ return;
+
+ delta = rq_clock(rq) - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(rq, delta);
+ rq_sched_info_dequeue(rq, delta);
}
/*
@@ -195,11 +223,14 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now, delta = 0;
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
+ if (!t->sched_info.last_queued)
+ return;
+
+ now = rq_clock(rq);
+ delta = now - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
@@ -210,14 +241,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
- if (sched_info_on()) {
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
- }
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -225,7 +254,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
@@ -234,8 +263,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
rq_sched_info_depart(rq, delta);
- if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ if (task_is_running(t))
+ sched_info_enqueue(rq, t);
}
/*
@@ -244,7 +273,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the CPU. It's not interesting to record
@@ -258,18 +287,10 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
sched_info_arrive(rq, next);
}
-static inline void
-sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
-{
- if (sched_info_on())
- __sched_info_switch(rq, prev, next);
-}
-
#else /* !CONFIG_SCHED_INFO: */
-# define sched_info_queued(rq, t) do { } while (0)
-# define sched_info_reset_dequeued(t) do { } while (0)
-# define sched_info_dequeued(rq, t) do { } while (0)
-# define sched_info_depart(rq, t) do { } while (0)
-# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_enqueue(rq, t) do { } while (0)
+# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..b1b8fe61c532 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,11 +7,10 @@
*
* See kernel/stop_machine.c
*/
-#include "sched.h"
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -24,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SMP */
static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
@@ -34,15 +33,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
{
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ struct task_struct *p = pick_task_stop(rq);
+
+ if (p)
+ set_next_task_stop(rq, p, true);
+
+ return p;
+}
+
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -62,21 +70,7 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *curr = rq->curr;
- u64 delta_exec;
-
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = rq_clock_task(rq);
- cgroup_account_cputime(curr, delta_exec);
+ update_curr_common(rq);
}
/*
@@ -102,12 +96,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -115,14 +103,13 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+DEFINE_SCHED_CLASS(stop) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
- .check_preempt_curr = check_preempt_curr_stop,
+ .wakeup_preempt = wakeup_preempt_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
@@ -130,14 +117,13 @@ const struct sched_class stop_sched_class = {
#ifdef CONFIG_SMP
.balance = balance_stop,
+ .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..72505cd3b60a 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,7 +2,6 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
-#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
@@ -19,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head);
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
-void swake_up_locked(struct swait_queue_head *q)
+void swake_up_locked(struct swait_queue_head *q, int wake_flags)
{
struct swait_queue *curr;
@@ -27,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q)
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- wake_up_process(curr->task);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
@@ -42,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked);
void swake_up_all_locked(struct swait_queue_head *q)
{
while (!list_empty(&q->task_list))
- swake_up_locked(q);
+ swake_up_locked(q, 0);
}
void swake_up_one(struct swait_queue_head *q)
@@ -50,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q)
unsigned long flags;
raw_spin_lock_irqsave(&q->lock, flags);
- swake_up_locked(q);
+ swake_up_locked(q, 0);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up_one);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..10d1391e7416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,7 +2,8 @@
/*
* Scheduler topology setup/handling methods
*/
-#include "sched.h"
+
+#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
@@ -14,21 +15,29 @@ static cpumask_var_t sched_domains_tmpmask2;
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = true;
+ sched_debug_verbose = true;
return 0;
}
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
static inline bool sched_debug(void)
{
- return sched_debug_enabled;
+ return sched_debug_verbose;
}
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
cpumask_clear(groupmask);
@@ -43,6 +52,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@@ -51,7 +75,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_span(group))) {
+ if (cpumask_empty(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
@@ -108,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_debug_enabled)
+ if (!sched_debug_verbose)
return;
if (!sd) {
@@ -129,7 +153,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
@@ -137,22 +161,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +197,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
if (~cflags & pflags)
return 0;
@@ -193,12 +208,84 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-unsigned int sysctl_sched_energy_aware = 1;
-DEFINE_MUTEX(sched_energy_mutex);
-bool sched_energy_update;
+static unsigned int sysctl_sched_energy_aware = 1;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
+
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+ bool any_asym_capacity = false;
+ struct cpufreq_policy *policy;
+ struct cpufreq_governor *gov;
+ int i;
+
+ /* EAS is enabled for asymmetric CPU capacity topologies. */
+ for_each_cpu(i, cpu_mask) {
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+ any_asym_capacity = true;
+ break;
+ }
+ }
+ if (!any_asym_capacity) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ if (!arch_scale_freq_invariant()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* Do not attempt EAS if schedutil is not being used. */
+ for_each_cpu(i, cpu_mask) {
+ policy = cpufreq_cpu_get(i);
+ if (!policy) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
+ cpumask_pr_args(cpu_mask), i);
+ }
+ return false;
+ }
+ gov = policy->governor;
+ cpufreq_cpu_put(policy);
+ if (gov != &schedutil_gov) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void rebuild_sched_domains_energy(void)
+{
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = true;
+ rebuild_sched_domains();
+ sched_energy_update = false;
+ mutex_unlock(&sched_energy_mutex);
+}
#ifdef CONFIG_PROC_SYSCTL
-int sched_energy_aware_handler(struct ctl_table *table, int write,
+static int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -206,20 +293,45 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!sched_is_eas_possible(cpu_active_mask)) {
+ if (write) {
+ return -EOPNOTSUPP;
+ } else {
+ *lenp = 0;
+ return 0;
+ }
+ }
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware) {
- mutex_lock(&sched_energy_mutex);
- sched_energy_update = 1;
- rebuild_sched_domains();
- sched_energy_update = 0;
- mutex_unlock(&sched_energy_mutex);
- }
+ if (state != sysctl_sched_energy_aware)
+ rebuild_sched_domains_energy();
}
return ret;
}
+
+static struct ctl_table sched_energy_aware_sysctls[] = {
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init sched_energy_aware_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_energy_aware_sysctls);
+ return 0;
+}
+
+late_initcall(sched_energy_aware_sysctl_init);
#endif
static void free_pd(struct perf_domain *pd)
@@ -272,10 +384,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map,
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
- printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
- em_pd_nr_cap_states(pd->em_pd));
+ em_pd_nr_perf_states(pd->em_pd));
pd = pd->next;
}
@@ -308,94 +420,33 @@ static void sched_energy_set(bool has_eas)
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
* 3. no SMT is detected.
- * 4. the EM complexity is low enough to keep scheduling overheads low;
- * 5. schedutil is driving the frequency of all CPUs of the rd;
- *
- * The complexity of the Energy Model is defined as:
- *
- * C = nr_pd * (nr_cpus + nr_cs)
- *
- * with parameters defined as:
- * - nr_pd: the number of performance domains
- * - nr_cpus: the number of CPUs
- * - nr_cs: the sum of the number of capacity states of all performance
- * domains (for example, on a system with 2 performance domains,
- * with 10 capacity states each, nr_cs = 2 * 10 = 20).
- *
- * It is generally not a good idea to use such a model in the wake-up path on
- * very complex platforms because of the associated scheduling overheads. The
- * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 5. frequency invariance support is present;
*/
-#define EM_MAX_COMPLEXITY 2048
-
-extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i;
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
if (!sysctl_sched_energy_aware)
goto free;
- /* EAS is enabled for asymmetric CPU capacity topologies. */
- if (!per_cpu(sd_asym_cpucapacity, cpu)) {
- if (sched_debug()) {
- pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
- cpumask_pr_args(cpu_map));
- }
- goto free;
- }
-
- /* EAS definitely does *not* handle SMT */
- if (sched_smt_active()) {
- pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
- cpumask_pr_args(cpu_map));
+ if (!sched_is_eas_possible(cpu_map))
goto free;
- }
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
continue;
- /* Do not attempt EAS if schedutil is not being used. */
- policy = cpufreq_cpu_get(i);
- if (!policy)
- goto free;
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (rd->pd)
- pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
/* Create the new pd and add it to the local list. */
tmp = pd_init(i);
if (!tmp)
goto free;
tmp->next = pd;
pd = tmp;
-
- /*
- * Count performance domains and capacity states for the
- * complexity check.
- */
- nr_pd++;
- nr_cs += em_pd_nr_cap_states(pd->em_pd);
- }
-
- /* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
- WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
- cpumask_pr_args(cpu_map));
- goto free;
}
perf_domain_debug(cpu_map, pd);
@@ -438,9 +489,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@@ -466,7 +517,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -499,9 +550,10 @@ static int init_rootdomain(struct root_domain *rd)
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
+ rd->visit_gen = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -530,7 +582,7 @@ out:
*/
struct root_domain def_root_domain;
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -616,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd)
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
static void update_top_cache_domain(int cpu)
{
@@ -641,13 +696,24 @@ static void update_top_cache_domain(int cpu)
per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+ sd = lowest_flag_domain(cpu, SD_CLUSTER);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * This assignment should be placed after the sd_llc_id as
+ * we want this id equals to cluster id on cluster machines
+ * but equals to LLC id on non-Cluster machines.
+ */
+ per_cpu(sd_share_id, cpu) = id;
+
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
@@ -669,8 +735,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
- if (parent->parent)
+
+ if (parent->parent) {
parent->parent->child = tmp;
+ parent->parent->groups->flags = tmp->flags;
+ }
+
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@@ -687,8 +757,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp);
- if (sd)
+ if (sd) {
+ struct sched_group *sg = sd->groups;
+
+ /*
+ * sched groups hold the flags of the child sched
+ * domain for convenience. Clear such flags since
+ * the child is being destroyed.
+ */
+ do {
+ sg->flags = 0;
+ } while (sg != sd->groups);
+
sd->child = NULL;
+ }
}
sched_domain_debug(sd, cpu);
@@ -884,10 +966,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
return NULL;
sg_span = sched_group_span(sg);
- if (sd->child)
+ if (sd->child) {
cpumask_copy(sg_span, sched_domain_span(sd->child));
- else
+ sg->flags = sd->child->flags;
+ } else {
cpumask_copy(sg_span, sched_domain_span(sd));
+ }
atomic_inc(&sg->ref);
return sg;
@@ -902,7 +986,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
int cpu;
build_balance_mask(sd, sg, mask);
- cpu = cpumask_first_and(sched_group_span(sg), mask);
+ cpu = cpumask_first(mask);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
if (atomic_inc_return(&sg->sgc->ref) == 1)
@@ -921,6 +1005,31 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
+static struct sched_domain *
+find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
+{
+ /*
+ * The proper descendant would be the one whose child won't span out
+ * of sd
+ */
+ while (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child),
+ sched_domain_span(sd)))
+ sibling = sibling->child;
+
+ /*
+ * As we are referencing sgc across different topology level, we need
+ * to go down to skip those sched_domains which don't contribute to
+ * scheduling because they will be degenerated in cpu_attach_domain
+ */
+ while (sibling->child &&
+ cpumask_equal(sched_domain_span(sibling->child),
+ sched_domain_span(sibling)))
+ sibling = sibling->child;
+
+ return sibling;
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -954,6 +1063,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
+ /*
+ * Usually we build sched_group by sibling's child sched_domain
+ * But for machines whose NUMA diameter are 3 or above, we move
+ * to build sched_group by sibling's proper descendant's child
+ * domain because sibling's child sched_domain will span out of
+ * the sched_domain being built as below.
+ *
+ * Smallest diameter=3 topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 40
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 40 30 20 10
+ *
+ * 0 --- 1 --- 2 --- 3
+ *
+ * NUMA-3 0-3 N/A N/A 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-2 0-2 0-3 0-3 1-3
+ * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
+ *
+ * NUMA-1 0-1 0-2 1-3 2-3
+ * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
+ * group span isn't a subset of the domain span.
+ */
+ if (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child), span))
+ sibling = find_descended_sibling(sd, sibling);
+
sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
@@ -961,7 +1105,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- init_overlap_sched_group(sd, sg);
+ init_overlap_sched_group(sibling, sg);
if (!first)
first = sg;
@@ -989,7 +1133,7 @@ fail:
*
* - Simultaneous multithreading (SMT)
* - Multi-Core Cache (MC)
- * - Package (DIE)
+ * - Package (PKG)
*
* Where the last one more or less denotes everything up to a NUMA node.
*
@@ -1011,13 +1155,13 @@ fail:
*
* CPU 0 1 2 3 4 5 6 7
*
- * DIE [ ]
+ * PKG [ ]
* MC [ ] [ ]
* SMT [ ] [ ] [ ] [ ]
*
* - or -
*
- * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
*
@@ -1077,6 +1221,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ sg->flags = child->flags;
} else {
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
@@ -1145,14 +1290,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
@@ -1175,6 +1330,116 @@ next:
}
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+/*
+ * Set of available CPUs grouped by their corresponding capacities
+ * Each list entry contains a CPU mask reflecting CPUs that share the same
+ * capacity.
+ * The lifespan of data is unlimited.
+ */
+static LIST_HEAD(asym_cap_list);
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
+ * Verify whether there is any CPU capacity asymmetry in a given sched domain.
+ * Provides sd_flags reflecting the asymmetry scope.
+ */
+static inline int
+asym_cpu_capacity_classify(const struct cpumask *sd_span,
+ const struct cpumask *cpu_map)
+{
+ struct asym_cap_data *entry;
+ int count = 0, miss = 0;
+
+ /*
+ * Count how many unique CPU capacities this domain spans across
+ * (compare sched_domain CPUs mask with ones representing available
+ * CPUs capacities). Take into account CPUs that might be offline:
+ * skip those.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
+ ++count;
+ else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
+ ++miss;
+ }
+
+ WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
+
+ /* No asymmetry detected */
+ if (count < 2)
+ return 0;
+ /* Some of the available CPU capacity values have not been detected */
+ if (miss)
+ return SD_ASYM_CPUCAPACITY;
+
+ /* Full asymmetry */
+ return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
+
+}
+
+static inline void asym_cpu_capacity_update_data(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ struct asym_cap_data *entry = NULL;
+
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ goto done;
+ }
+
+ entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
+ if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
+ return;
+ entry->capacity = capacity;
+ list_add(&entry->link, &asym_cap_list);
+done:
+ __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
+}
+
+/*
+ * Build-up/update list of CPUs grouped by their capacities
+ * An update requires explicit request to rebuild sched domains
+ * with state indicating CPU topology changes.
+ */
+static void asym_cpu_capacity_scan(void)
+{
+ struct asym_cap_data *entry, *next;
+ int cpu;
+
+ list_for_each_entry(entry, &asym_cap_list, link)
+ cpumask_clear(cpu_capacity_span(entry));
+
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
+ asym_cpu_capacity_update_data(cpu);
+
+ list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
+ if (cpumask_empty(cpu_capacity_span(entry))) {
+ list_del(&entry->link);
+ kfree(entry);
+ }
+ }
+
+ /*
+ * Only one capacity value has been detected i.e. this system is symmetric.
+ * No need to keep this data around.
+ */
+ if (list_is_singular(&asym_cap_list)) {
+ entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
+ list_del(&entry->link);
+ kfree(entry);
+ }
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -1219,13 +1484,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
- /* Fall through */
+ fallthrough;
case sa_sd:
free_percpu(d->sd);
- /* Fall through */
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
- /* Fall through */
+ fallthrough;
case sa_none:
break;
}
@@ -1279,7 +1544,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1292,7 +1556,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1301,19 +1564,20 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
+ SD_CLUSTER | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
+ struct cpumask *sd_span;
#ifdef CONFIG_NUMA
/*
@@ -1328,16 +1592,13 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- /* Apply detected topology flags */
- sd_flags |= dflags;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
@@ -1357,20 +1618,26 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
.max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
+ .last_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- sd_id = cpumask_first(sched_domain_span(sd));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sd_span);
+
+ sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
+
+ WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
+ (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
+ "CPU capacity asymmetry not supported on SMT\n");
/*
* Convert topological properties into behaviour.
*/
-
/* Don't attempt to spread across CPUs of different capacities. */
if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
@@ -1421,25 +1688,32 @@ static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, SD_INIT_NAME(PKG) },
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;
sched_domain_topology = tl;
+ sched_domain_topology_saved = NULL;
}
#ifdef CONFIG_NUMA
@@ -1463,8 +1737,12 @@ static void sched_numa_warn(const char *str)
for (i = 0; i < nr_node_ids; i++) {
printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+ printk(KERN_CONT "(%02d) ", node_distance(i,j));
+ else
+ printk(KERN_CONT " %02d ", node_distance(i,j));
+ }
printk(KERN_CONT "\n");
}
printk(KERN_WARNING "\n");
@@ -1472,19 +1750,34 @@ static void sched_numa_warn(const char *str)
bool find_numa_distance(int distance)
{
- int i;
+ bool found = false;
+ int i, *distances;
if (distance == node_distance(0, 0))
return true;
+ rcu_read_lock();
+ distances = rcu_dereference(sched_domains_numa_distance);
+ if (!distances)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
+ if (distances[i] == distance) {
+ found = true;
+ break;
+ }
}
+unlock:
+ rcu_read_unlock();
- return false;
+ return found;
}
+#define for_each_cpu_node_but(n, nbut) \
+ for_each_node_state(n, N_CPU) \
+ if (n == nbut) \
+ continue; \
+ else
+
/*
* A system can have three types of NUMA topology:
* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
@@ -1504,7 +1797,7 @@ bool find_numa_distance(int distance)
* there is an intermediary node C, which is < N hops away from both
* nodes A and B, the system is a glueless mesh.
*/
-static void init_numa_topology_type(void)
+static void init_numa_topology_type(int offline_node)
{
int a, b, c, n;
@@ -1515,14 +1808,14 @@ static void init_numa_topology_type(void)
return;
}
- for_each_online_node(a) {
- for_each_online_node(b) {
+ for_each_cpu_node_but(a, offline_node) {
+ for_each_cpu_node_but(b, offline_node) {
/* Find two nodes furthest removed from each other. */
if (node_distance(a, b) < n)
continue;
/* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
+ for_each_cpu_node_but(c, offline_node) {
if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
sched_numa_topology_type =
@@ -1535,68 +1828,67 @@ static void init_numa_topology_type(void)
return;
}
}
+
+ pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+ sched_numa_topology_type = NUMA_DIRECT;
}
-void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
+ int *distances;
+ struct cpumask ***masks;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
+ for_each_cpu_node_but(i, offline_node) {
+ for_each_cpu_node_but(j, offline_node) {
+ int distance = node_distance(i, j);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ bitmap_free(distance_map);
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ bitmap_set(distance_map, distance, 1);
}
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!distances) {
+ bitmap_free(distance_map);
+ return;
}
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ distances[i] = j;
+ }
+ rcu_assign_pointer(sched_domains_numa_distance, distances);
+
+ bitmap_free(distance_map);
+
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1605,36 +1897,40 @@ void sched_init_numa(void)
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+ * the array will contain less then 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
+ masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!masks)
return;
/*
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
+ for (i = 0; i < nr_levels; i++) {
+ masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!masks[i])
return;
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(j, offline_node) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
- sched_domains_numa_masks[i][j] = mask;
+ masks[i][j] = mask;
+
+ for_each_cpu_node_but(k, offline_node) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
- for_each_node(k) {
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1642,11 +1938,12 @@ void sched_init_numa(void)
}
}
}
+ rcu_assign_pointer(sched_domains_numa_masks, masks);
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1669,7 +1966,7 @@ void sched_init_numa(void)
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1679,12 +1976,67 @@ void sched_init_numa(void)
};
}
+ sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
+
+ init_numa_topology_type(offline_node);
+}
+
+
+static void sched_reset_numa(void)
+{
+ int nr_levels, *distances;
+ struct cpumask ***masks;
+
+ nr_levels = sched_domains_numa_levels;
+ sched_domains_numa_levels = 0;
+ sched_max_numa_distance = 0;
+ sched_numa_topology_type = NUMA_DIRECT;
+ distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_domains_numa_distance, NULL);
+ masks = sched_domains_numa_masks;
+ rcu_assign_pointer(sched_domains_numa_masks, NULL);
+ if (distances || masks) {
+ int i, j;
+
+ synchronize_rcu();
+ kfree(distances);
+ for (i = 0; i < nr_levels && masks; i++) {
+ if (!masks[i])
+ continue;
+ for_each_node(j)
+ kfree(masks[i][j]);
+ kfree(masks[i]);
+ }
+ kfree(masks);
+ }
+ if (sched_domain_topology_saved) {
+ kfree(sched_domain_topology);
+ sched_domain_topology = sched_domain_topology_saved;
+ sched_domain_topology_saved = NULL;
+ }
+}
+
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+ int node;
- init_numa_topology_type();
+ node = cpu_to_node(cpu);
+ /*
+ * Scheduler NUMA topology is updated when the first CPU of a
+ * node is onlined or the last CPU of a node is offlined.
+ */
+ if (cpumask_weight(cpumask_of_node(node)) != 1)
+ return;
+
+ sched_reset_numa();
+ sched_init_numa(online ? NUMA_NO_NODE : node);
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1694,6 +2046,10 @@ void sched_domains_numa_masks_set(unsigned int cpu)
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(j, N_CPU))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
@@ -1705,8 +2061,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ for (j = 0; j < nr_node_ids; j++) {
+ if (sched_domains_numa_masks[i][j])
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
}
}
@@ -1720,16 +2078,130 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
*/
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
- int i, j = cpu_to_node(cpu);
+ int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+ struct cpumask ***masks;
+ rcu_read_lock();
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
- if (cpu < nr_cpu_ids)
- return cpu;
+ if (!masks[i][j])
+ break;
+ cpu = cpumask_any_and(cpus, masks[i][j]);
+ if (cpu < nr_cpu_ids) {
+ found = cpu;
+ break;
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ return found;
+}
+
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ if (b == k->masks) {
+ k->w = 0;
+ return 0;
}
- return nr_cpu_ids;
+
+ prev_hop = *((struct cpumask ***)b - 1);
+ k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
}
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ * from @cpus to @cpu, taking into account distance
+ * from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ if (node == NUMA_NO_NODE)
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
+ rcu_read_lock();
+
+ /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+ node = numa_nearest_node(node, N_CPU);
+ k.node = node;
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -1839,9 +2311,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
if (child) {
sd->level = child->level + 1;
@@ -1904,65 +2376,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
}
/*
- * Find the sched_domain_topology_level where all CPU capacities are visible
- * for all CPUs.
- */
-static struct sched_domain_topology_level
-*asym_cpu_capacity_level(const struct cpumask *cpu_map)
-{
- int i, j, asym_level = 0;
- bool asym = false;
- struct sched_domain_topology_level *tl, *asym_tl = NULL;
- unsigned long cap;
-
- /* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
-
- for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(i) != cap) {
- asym = true;
- break;
- }
- }
-
- if (!asym)
- return NULL;
-
- /*
- * Examine topology from all CPU's point of views to detect the lowest
- * sched_domain_topology_level where a highest capacity CPU is visible
- * to everyone.
- */
- for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(i);
- int tl_id = 0;
-
- for_each_sd_topology(tl) {
- if (tl_id < asym_level)
- goto next_level;
-
- for_each_cpu_and(j, tl->mask(i), cpu_map) {
- unsigned long capacity;
-
- capacity = arch_scale_cpu_capacity(j);
-
- if (capacity <= max_capacity)
- continue;
-
- max_capacity = capacity;
- asym_level = tl_id;
- asym_tl = tl;
- }
-next_level:
- tl_id++;
- }
- }
-
- return asym_tl;
-}
-
-
-/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -1974,8 +2387,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
- struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ bool has_cluster = false;
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
@@ -1984,25 +2397,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (alloc_state != sa_rootdomain)
goto error;
- tl_asym = asym_cpu_capacity_level(cpu_map);
-
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
for_each_sd_topology(tl) {
- int dflags = 0;
-
- if (tl == tl_asym) {
- dflags |= SD_ASYM_CPUCAPACITY;
- has_asym = true;
- }
if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
goto error;
- sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+
+ has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
@@ -2027,6 +2434,64 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned int imb = 0;
+ unsigned int imb_span = 1;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_domain *child = sd->child;
+
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+ (child->flags & SD_SHARE_PKG_RESOURCES)) {
+ struct sched_domain __rcu *top_p;
+ unsigned int nr_llcs;
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd->span_weight / child->span_weight;
+ if (nr_llcs == 1)
+ imb = sd->span_weight >> 3;
+ else
+ imb = nr_llcs;
+ imb = max(1U, imb);
+ sd->imb_numa_nr = imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ imb_span = top_p ? top_p->span_weight : sd->span_weight;
+ } else {
+ int factor = max(1U, (sd->span_weight / imb_span));
+
+ sd->imb_numa_nr = imb * factor;
+ }
+ }
+ }
+
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
@@ -2041,21 +2506,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ unsigned long capacity;
+
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
+ capacity = arch_scale_cpu_capacity(i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
cpu_attach_domain(sd, d.rd, i);
+
+ if (lowest_flag_domain(i, SD_CLUSTER))
+ has_cluster = true;
}
rcu_read_unlock();
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
- if (rq && sched_debug_enabled) {
+ if (has_cluster)
+ static_branch_inc_cpuslocked(&sched_cluster_active);
+
+ if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
@@ -2073,7 +2547,7 @@ static cpumask_var_t *doms_cur;
/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;
-/* Attribues of custom domains in 'doms_cur' */
+/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
/*
@@ -2122,7 +2596,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* Set up scheduler domains and groups. For now this just excludes isolated
* CPUs, but could be used to exclude other special cases in the future.
*/
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
@@ -2131,13 +2605,13 @@ int sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
arch_update_cpu_topology();
+ asym_cpu_capacity_scan();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
return err;
}
@@ -2154,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+ if (static_branch_unlikely(&sched_cluster_active))
+ static_branch_dec_cpuslocked(&sched_cluster_active);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
@@ -2212,11 +2689,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
lockdep_assert_held(&sched_domains_mutex);
- /* Always unregister in case we don't destroy any domains: */
- unregister_sched_domain_sysctl();
-
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
+ /* Trigger rebuilding CPU capacity asymmetry data */
+ if (new_topology)
+ asym_cpu_capacity_scan();
if (!doms_new) {
WARN_ON_ONCE(dattr_new);
@@ -2225,7 +2702,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
if (doms_new) {
n = 1;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
} else {
n = ndoms_new;
@@ -2260,7 +2737,7 @@ match1:
n = 0;
doms_new = &fallback_doms;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
/* Build new domains: */
@@ -2303,7 +2780,7 @@ match3:
dattr_cur = dattr_new;
ndoms_cur = ndoms_new;
- register_sched_domain_sysctl();
+ update_sched_domain_debugfs();
}
/*
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index ba059fbfc53a..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,7 +4,6 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
@@ -37,6 +36,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -48,37 +58,26 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
EXPORT_SYMBOL(remove_wait_queue);
/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
-/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ int nr_exclusive, int wake_flags, void *key)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;
lockdep_assert_held(&wq_head->lock);
- if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
- curr = list_next_entry(bookmark, entry);
-
- list_del(&bookmark->entry);
- bookmark->flags = 0;
- } else
- curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
if (&curr->entry == &wq_head->head)
return nr_exclusive;
@@ -87,43 +86,28 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
unsigned flags = curr->flags;
int ret;
- if (flags & WQ_FLAG_BOOKMARK)
- continue;
-
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
-
- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
- (&next->entry != &wq_head->head)) {
- bookmark->flags = WQ_FLAG_BOOKMARK;
- list_add_tail(&bookmark->entry, &next->entry);
- break;
- }
}
return nr_exclusive;
}
-static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
+ int remaining;
- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
- do {
- spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
- wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+ return nr_exclusive - remaining;
}
/**
@@ -133,38 +117,37 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * If this function wakes up a task, it executes a full memory barrier before
- * accessing the task state.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were awaken.
*/
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -210,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
@@ -223,6 +206,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -249,17 +239,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent
}
EXPORT_SYMBOL(prepare_to_wait);
-void
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ bool was_empty = false;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- if (list_empty(&wq_entry->entry))
+ if (list_empty(&wq_entry->entry)) {
+ was_empty = list_empty(&wq_head->head);
__add_wait_queue_entry_tail(wq_head, wq_entry);
+ }
set_current_state(state);
spin_unlock_irqrestore(&wq_head->lock, flags);
+ return was_empty;
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
@@ -389,17 +384,12 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wq_entry->entry);
+ list_del_init_careful(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@@ -429,7 +419,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
- if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
-#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..d7809affe740 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -5,26 +5,57 @@
* Copyright (C) 2019 Google LLC
*/
+#include <linux/cpuhotplug.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/scs.h>
-#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-static struct kmem_cache *scs_cache;
+#ifdef CONFIG_DYNAMIC_SCS
+DEFINE_STATIC_KEY_FALSE(dynamic_scs_enabled);
+#endif
static void __scs_account(void *s, int account)
{
- struct page *scs_page = virt_to_page(s);
+ struct page *scs_page = vmalloc_to_page(s);
- mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
-static void *scs_alloc(int node)
+/* Matches NR_CACHED_STACKS for VMAP_STACK */
+#define NR_CACHED_SCS 2
+static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
+
+static void *__scs_alloc(int node)
+{
+ int i;
+ void *s;
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ s = this_cpu_xchg(scs_cache[i], NULL);
+ if (s) {
+ s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+ KASAN_VMALLOC_PROT_NORMAL);
+ memset(s, 0, SCS_SIZE);
+ goto out;
+ }
+ }
+
+ s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ GFP_SCS, PAGE_KERNEL, 0, node,
+ __builtin_return_address(0));
+
+out:
+ return kasan_reset_tag(s);
+}
+
+void *scs_alloc(int node)
{
- void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
+ void *s;
+ s = __scs_alloc(node);
if (!s)
return NULL;
@@ -34,27 +65,60 @@ static void *scs_alloc(int node)
* Poison the allocation to catch unintentional accesses to
* the shadow stack when KASAN is enabled.
*/
- kasan_poison_object_data(scs_cache, s);
+ kasan_poison_vmalloc(s, SCS_SIZE);
__scs_account(s, 1);
return s;
}
-static void scs_free(void *s)
+void scs_free(void *s)
{
+ int i;
+
__scs_account(s, -1);
- kasan_unpoison_object_data(scs_cache, s);
- kmem_cache_free(scs_cache, s);
+
+ /*
+ * We cannot sleep as this can be called in interrupt context,
+ * so use this_cpu_cmpxchg to update the cache, and vfree_atomic
+ * to free the stack.
+ */
+
+ for (i = 0; i < NR_CACHED_SCS; i++)
+ if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
+ return;
+
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
+ vfree_atomic(s);
+}
+
+static int scs_cleanup(unsigned int cpu)
+{
+ int i;
+ void **cache = per_cpu_ptr(scs_cache, cpu);
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ vfree(cache[i]);
+ cache[i] = NULL;
+ }
+
+ return 0;
}
void __init scs_init(void)
{
- scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL);
+ if (!scs_is_enabled())
+ return;
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
+ scs_cleanup);
}
int scs_prepare(struct task_struct *tsk, int node)
{
- void *s = scs_alloc(node);
+ void *s;
+
+ if (!scs_is_enabled())
+ return 0;
+ s = scs_alloc(node);
if (!s)
return -ENOMEM;
@@ -94,7 +158,7 @@ void scs_release(struct task_struct *tsk)
{
void *s = task_scs(tsk);
- if (!s)
+ if (!scs_is_enabled() || !s)
return;
WARN(task_scs_end_corrupted(tsk),
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d653d8426de9..aca7b437882e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,6 +13,7 @@
* Mode 2 allows user-defined system call filters in the form
* of Berkeley Packet Filters/Linux Socket Filters.
*/
+#define pr_fmt(fmt) "seccomp: " fmt
#include <linux/refcount.h>
#include <linux/audit.h>
@@ -28,6 +29,9 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
+/* Not exposed in headers: strictly internal use only. */
+#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
+
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif
@@ -37,10 +41,18 @@
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
-#include <linux/security.h>
-#include <linux/tracehook.h>
+#include <linux/capability.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
enum notify_state {
SECCOMP_NOTIFY_INIT,
@@ -77,10 +89,49 @@ struct seccomp_knotif {
long val;
u32 flags;
- /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+ /*
+ * Signals when this has changed states, such as the listener
+ * dying, a new seccomp addfd message, or changing to REPLIED
+ */
struct completion ready;
struct list_head list;
+
+ /* outstanding addfd requests */
+ struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ * installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ * is allowed.
+ * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
+ * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
+ * @ret: The return value of the installing process. It is set to the fd num
+ * upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ * installation, or gone away (either due to successful
+ * reply, or signal)
+ * @list: list_head for chaining seccomp_kaddfd together.
+ *
+ */
+struct seccomp_kaddfd {
+ struct file *file;
+ int fd;
+ unsigned int flags;
+ __u32 ioctl_flags;
+
+ union {
+ bool setfd;
+ /* To only be set on reply */
+ int ret;
+ };
+ struct completion completion;
+ struct list_head list;
};
/**
@@ -89,32 +140,78 @@ struct seccomp_knotif {
* structure is fairly large, we store the notification-specific stuff in a
* separate structure.
*
- * @request: A semaphore that users of this notification can wait on for
- * changes. Actual reads and writes are still controlled with
- * filter->notify_lock.
+ * @requests: A semaphore that users of this notification can wait on for
+ * changes. Actual reads and writes are still controlled with
+ * filter->notify_lock.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
- * @wqh: A wait queue for poll.
*/
+
struct notification {
- struct semaphore request;
+ atomic_t requests;
+ u32 flags;
u64 next_id;
struct list_head notifications;
- wait_queue_head_t wqh;
};
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
- * @usage: reference count to manage the object lifetime.
- * get/put helpers should be used when accessing an instance
- * outside of a lifetime-guarded section. In general, this
- * is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ * A filter's reference count is incremented for each directly
+ * attached task, once for the dependent filter, and if
+ * requested for the user notifier. When @refs reaches zero,
+ * the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ * attached task (filter installation, fork(), thread_sync),
+ * and once for the dependent filter (tracked in filter->prev).
+ * When it reaches zero it indicates that no direct or indirect
+ * users of that filter exist. No new tasks can get associated with
+ * this filter after reaching 0. The @users count is always smaller
+ * or equal to @refs. Hence, reaching 0 for @users does not mean
+ * the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
* @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -124,15 +221,19 @@ struct notification {
* how namespaces work.
*
* seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
*/
struct seccomp_filter {
- refcount_t usage;
+ refcount_t refs;
+ refcount_t users;
bool log;
+ bool wait_killable_recv;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
struct mutex notify_lock;
+ wait_queue_head_t wqh;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -144,6 +245,10 @@ struct seccomp_filter {
*/
static void populate_seccomp_data(struct seccomp_data *sd)
{
+ /*
+ * Instead of using current_pt_reg(), we're already doing the work
+ * to safely fetch "current", so just use "task" everywhere below.
+ */
struct task_struct *task = current;
struct pt_regs *regs = task_pt_regs(task);
unsigned long args[6];
@@ -242,6 +347,53 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -251,7 +403,6 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*
* Returns valid seccomp BPF response codes.
*/
-#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
@@ -264,6 +415,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@@ -300,14 +454,14 @@ static inline void seccomp_assign_mode(struct task_struct *task,
task->seccomp.mode = seccomp_mode;
/*
- * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
* filter) is set.
*/
smp_mb__before_atomic();
/* Assume default seccomp processes want spec flaw mitigation. */
if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
arch_seccomp_spec_mitigate(task);
- set_tsk_thread_flag(task, TIF_SECCOMP);
+ set_task_syscall_work(task, SECCOMP);
}
#ifdef CONFIG_SECCOMP_FILTER
@@ -366,9 +520,69 @@ static inline pid_t seccomp_can_sync_threads(void)
return 0;
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_destroy(filter->prog);
+ kfree(filter);
+ }
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+ while (orig && refcount_dec_and_test(&orig->users)) {
+ if (waitqueue_active(&orig->wqh))
+ wake_up_poll(&orig->wqh, EPOLLHUP);
+ orig = orig->prev;
+ }
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+ /* Clean up single-reference branches iteratively. */
+ while (orig && refcount_dec_and_test(&orig->refs)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ seccomp_filter_free(freeme);
+ }
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+ /* Notify about any unused filters in the task's former filter tree. */
+ __seccomp_filter_orphan(orig);
+ /* Finally drop all references to the task's former tree. */
+ __put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ * drop its reference count, and notify
+ * about unused filters
+ *
+ * @tsk: task the filter should be released from.
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. As such, READ_ONCE() and
+ * barriers are not needed here, as would normally be needed.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+
+ /* We are effectively holding the siglock by not having any sighand. */
+ WARN_ON(tsk->sighand != NULL);
+
+ /* Detach task from its filter tree. */
+ tsk->seccomp.filter = NULL;
+ __seccomp_filter_release(orig);
+}
+
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
+ * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
+ *
* Expects sighand and cred_guard_mutex locks to be held, and for
* seccomp_can_sync_threads() to have returned success already
* without dropping the locks.
@@ -390,14 +604,19 @@ static inline void seccomp_sync_threads(unsigned long flags)
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
+
/*
* Drop the task reference to the shared ancestor since
* current's path will hold a reference. (This also
* allows a put before the assignment.)
*/
- put_seccomp_filter(thread);
+ __seccomp_filter_release(thread->seccomp.filter);
+
+ /* Make our new filter tree visible. */
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+ atomic_set(&thread->seccomp.filter_count,
+ atomic_read(&caller->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
@@ -430,7 +649,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -444,8 +668,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
+ !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
@@ -461,7 +684,9 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- refcount_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->refs, 1);
+ refcount_set(&sfilter->users, 1);
+ init_waitqueue_head(&sfilter->wqh);
return sfilter;
}
@@ -494,6 +719,148 @@ out:
return filter;
}
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF programs
+ * @sd: The seccomp data to check against, only syscall number and arch
+ * number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ for (pc = 0; pc < fprog->len; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ if (bitmap_prev) {
+ /* The new filter must be as restrictive as the last. */
+ bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+ } else {
+ /* Before any filters, all syscalls are always allowed. */
+ bitmap_fill(bitmap, bitmap_size);
+ }
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ /* No bitmap change: not a cacheable action. */
+ if (!test_bit(nr, bitmap))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ /* No bitmap change: continue to always allow. */
+ if (seccomp_is_const_allow(fprog, &sd))
+ continue;
+
+ /*
+ * Not a cacheable action: always run filters.
+ * atomic clear_bit() not needed, filter not visible yet.
+ */
+ __clear_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct action_cache *cache = &sfilter->cache;
+ const struct action_cache *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+ cache_prev ? cache_prev->allow_native : NULL,
+ SECCOMP_ARCH_NATIVE_NR,
+ SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+ cache_prev ? cache_prev->allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -538,12 +905,18 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
+ /* Set wait killable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ filter->wait_killable_recv = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
+ atomic_inc(&current->seccomp.filter_count);
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -554,7 +927,7 @@ static long seccomp_attach_filter(unsigned int flags,
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
- refcount_inc(&filter->usage);
+ refcount_inc(&filter->refs);
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -564,56 +937,9 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
__get_seccomp_filter(orig);
+ refcount_inc(&orig->users);
}
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
- if (filter) {
- bpf_prog_destroy(filter->prog);
- kfree(filter);
- }
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
- /* Clean up single-reference branches iteratively. */
- while (orig && refcount_dec_and_test(&orig->usage)) {
- struct seccomp_filter *freeme = orig;
- orig = orig->prev;
- seccomp_filter_free(freeme);
- }
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
- __put_seccomp_filter(tsk->seccomp.filter);
-}
-
-static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
-{
- clear_siginfo(info);
- info->si_signo = SIGSYS;
- info->si_code = SYS_SECCOMP;
- info->si_call_addr = (void __user *)KSTK_EIP(current);
- info->si_errno = reason;
- info->si_arch = syscall_get_arch(current);
- info->si_syscall = syscall;
-}
-
-/**
- * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
- * @syscall: syscall number to send to userland
- * @reason: filter-supplied reason code to send to userland (via si_errno)
- *
- * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
- */
-static void seccomp_send_sigsys(int syscall, int reason)
-{
- struct kernel_siginfo info;
- seccomp_init_siginfo(&info, syscall, reason);
- force_sig_info(&info);
-}
#endif /* CONFIG_SECCOMP_FILTER */
/* For use with seccomp_actions_logged */
@@ -684,24 +1010,25 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
*/
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
- 0, /* null terminated */
+ -1, /* negative terminated */
};
static void __secure_computing_strict(int this_syscall)
{
- const int *syscall_whitelist = mode1_syscalls;
+ const int *allowed_syscalls = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- syscall_whitelist = get_compat_mode1_syscalls();
+ allowed_syscalls = get_compat_mode1_syscalls();
#endif
do {
- if (*syscall_whitelist == this_syscall)
+ if (*allowed_syscalls == this_syscall)
return;
- } while (*++syscall_whitelist);
+ } while (*++allowed_syscalls != -1);
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -735,6 +1062,46 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n)
+{
+ int fd;
+
+ /*
+ * Remove the notification, and reset the list pointers, indicating
+ * that it has been handled.
+ */
+ list_del_init(&addfd->list);
+ if (!addfd->setfd)
+ fd = receive_fd(addfd->file, NULL, addfd->flags);
+ else
+ fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+ addfd->ret = fd;
+
+ if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
+ /* If we fail reset and return an error to the notifier */
+ if (fd < 0) {
+ n->state = SECCOMP_NOTIFY_SENT;
+ } else {
+ /* Return the FD we just added */
+ n->flags = 0;
+ n->error = 0;
+ n->val = fd;
+ }
+ }
+
+ /*
+ * Mark the notification as completed. From this point, addfd mem
+ * might be invalidated and we can't safely read it anymore.
+ */
+ complete(&addfd->completion);
+}
+
+static bool should_sleep_killable(struct seccomp_filter *match,
+ struct seccomp_knotif *n)
+{
+ return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -743,6 +1110,7 @@ static int seccomp_do_user_notification(int this_syscall,
u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
+ struct seccomp_kaddfd *addfd, *tmp;
mutex_lock(&match->notify_lock);
err = -ENOSYS;
@@ -754,26 +1122,63 @@ static int seccomp_do_user_notification(int this_syscall,
n.data = sd;
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
- list_add(&n.list, &match->notif->notifications);
+ list_add_tail(&n.list, &match->notif->notifications);
+ INIT_LIST_HEAD(&n.addfd);
- up(&match->notif->request);
- wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
- mutex_unlock(&match->notify_lock);
+ atomic_inc(&match->notif->requests);
+ if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ else
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
/*
* This is where we wait for a reply from userspace.
*/
- err = wait_for_completion_interruptible(&n.ready);
- mutex_lock(&match->notify_lock);
- if (err == 0) {
- ret = n.val;
- err = n.error;
- flags = n.flags;
+ do {
+ bool wait_killable = should_sleep_killable(match, &n);
+
+ mutex_unlock(&match->notify_lock);
+ if (wait_killable)
+ err = wait_for_completion_killable(&n.ready);
+ else
+ err = wait_for_completion_interruptible(&n.ready);
+ mutex_lock(&match->notify_lock);
+
+ if (err != 0) {
+ /*
+ * Check to see if the notifcation got picked up and
+ * whether we should switch to wait killable.
+ */
+ if (!wait_killable && should_sleep_killable(match, &n))
+ continue;
+
+ goto interrupted;
+ }
+
+ addfd = list_first_entry_or_null(&n.addfd,
+ struct seccomp_kaddfd, list);
+ /* Check if we were woken up by a addfd message */
+ if (addfd)
+ seccomp_handle_addfd(addfd, &n);
+
+ } while (n.state != SECCOMP_NOTIFY_REPLIED);
+
+ ret = n.val;
+ err = n.error;
+ flags = n.flags;
+
+interrupted:
+ /* If there were any pending addfd calls, clear them out */
+ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+ /* The process went away before we got a chance to handle it */
+ addfd->ret = -ESRCH;
+ list_del_init(&addfd->list);
+ complete(&addfd->completion);
}
/*
* Note that it's possible the listener died in between the time when
- * we were notified of a respons (or a signal) and when we were able to
+ * we were notified of a response (or a signal) and when we were able to
* re-acquire the lock, so only delete from the list if the
* notification actually exists.
*
@@ -790,7 +1195,7 @@ out:
if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
return 0;
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, current_pt_regs(),
err, ret);
return -1;
}
@@ -805,9 +1210,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/*
* Make sure that any changes to mode from another thread have
- * been seen after TIF_SECCOMP was seen.
+ * been seen after SYSCALL_WORK_SECCOMP was seen.
*/
- rmb();
+ smp_rmb();
if (!sd) {
populate_seccomp_data(&sd_local);
@@ -823,15 +1228,15 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
if (data > MAX_ERRNO)
data = MAX_ERRNO;
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, current_pt_regs(),
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
- syscall_rollback(current, task_pt_regs(current));
+ syscall_rollback(current, current_pt_regs());
/* Let the filter pass back 16 bits of data. */
- seccomp_send_sigsys(this_syscall, data);
+ force_sig_seccomp(this_syscall, data, false);
goto skip;
case SECCOMP_RET_TRACE:
@@ -842,7 +1247,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/* ENOSYS these calls if there is no tracer attached. */
if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
syscall_set_return_value(current,
- task_pt_regs(current),
+ current_pt_regs(),
-ENOSYS, 0);
goto skip;
}
@@ -862,7 +1267,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
if (fatal_signal_pending(current))
goto skip;
/* Check if the tracer forced the syscall to be skipped. */
- this_syscall = syscall_get_nr(current, task_pt_regs(current));
+ this_syscall = syscall_get_nr(current, current_pt_regs());
if (this_syscall < 0)
goto skip;
@@ -898,22 +1303,19 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
case SECCOMP_RET_KILL_THREAD:
case SECCOMP_RET_KILL_PROCESS:
default:
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
- if (action == SECCOMP_RET_KILL_PROCESS ||
- get_nr_threads(current) == 1) {
- kernel_siginfo_t info;
-
+ if (action != SECCOMP_RET_KILL_THREAD ||
+ (atomic_read(&current->signal->live) == 1)) {
/* Show the original registers in the dump. */
- syscall_rollback(current, task_pt_regs(current));
- /* Trigger a manual coredump since do_exit skips it. */
- seccomp_init_siginfo(&info, this_syscall, data);
- do_coredump(&info);
- }
- if (action == SECCOMP_RET_KILL_PROCESS)
- do_group_exit(SIGSYS);
- else
+ syscall_rollback(current, current_pt_regs());
+ /* Trigger a coredump with SIGSYS */
+ force_sig_seccomp(this_syscall, data, true);
+ } else {
do_exit(SIGSYS);
+ }
+ return -1; /* skip the syscall go directly to signal handling */
}
unreachable();
@@ -927,6 +1329,8 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
BUG();
+
+ return -1;
}
#endif
@@ -940,7 +1344,7 @@ int __secure_computing(const struct seccomp_data *sd)
return 0;
this_syscall = sd ? sd->nr :
- syscall_get_nr(current, task_pt_regs(current));
+ syscall_get_nr(current, current_pt_regs());
switch (mode) {
case SECCOMP_MODE_STRICT:
@@ -948,6 +1352,11 @@ int __secure_computing(const struct seccomp_data *sd)
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
+ /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
+ case SECCOMP_MODE_DEAD:
+ WARN_ON_ONCE(1);
+ do_exit(SIGKILL);
+ return -1;
default:
BUG();
}
@@ -989,13 +1398,18 @@ out:
}
#ifdef CONFIG_SECCOMP_FILTER
-static int seccomp_notify_release(struct inode *inode, struct file *file)
+static void seccomp_notify_free(struct seccomp_filter *filter)
+{
+ kfree(filter->notif);
+ filter->notif = NULL;
+}
+
+static void seccomp_notify_detach(struct seccomp_filter *filter)
{
- struct seccomp_filter *filter = file->private_data;
struct seccomp_knotif *knotif;
if (!filter)
- return 0;
+ return;
mutex_lock(&filter->notify_lock);
@@ -1011,16 +1425,75 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
knotif->error = -ENOSYS;
knotif->val = 0;
+ /*
+ * We do not need to wake up any pending addfd messages, as
+ * the notifier will do that for us, as this just looks
+ * like a standard reply.
+ */
complete(&knotif->ready);
}
- kfree(filter->notif);
- filter->notif = NULL;
+ seccomp_notify_free(filter);
mutex_unlock(&filter->notify_lock);
+}
+
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+ struct seccomp_filter *filter = file->private_data;
+
+ seccomp_notify_detach(filter);
__put_seccomp_filter(filter);
return 0;
}
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+ struct seccomp_knotif *cur;
+
+ lockdep_assert_held(&filter->notify_lock);
+
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->id == id)
+ return cur;
+ }
+
+ return NULL;
+}
+
+static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ /* Avoid a wakeup if event not interesting for us. */
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static int recv_wait_event(struct seccomp_filter *filter)
+{
+ DEFINE_WAIT_FUNC(wait, recv_wake_function);
+ int ret;
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ return 0;
+
+ for (;;) {
+ ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ break;
+
+ if (ret)
+ return ret;
+
+ schedule();
+ }
+ finish_wait(&filter->wqh, &wait);
+ return 0;
+}
+
static long seccomp_notify_recv(struct seccomp_filter *filter,
void __user *buf)
{
@@ -1037,7 +1510,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
memset(&unotif, 0, sizeof(unotif));
- ret = down_interruptible(&filter->notif->request);
+ ret = recv_wait_event(filter);
if (ret < 0)
return ret;
@@ -1064,7 +1537,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
unotif.data = *(knotif->data);
knotif->state = SECCOMP_NOTIFY_SENT;
- wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+ wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
ret = 0;
out:
mutex_unlock(&filter->notify_lock);
@@ -1078,18 +1551,15 @@ out:
* may have died when we released the lock, so we need to make
* sure it's still around.
*/
- knotif = NULL;
mutex_lock(&filter->notify_lock);
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == unotif.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, unotif.id);
if (knotif) {
+ /* Reset the process to make sure it's not stuck */
+ if (should_sleep_killable(filter, knotif))
+ complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
- up(&filter->notif->request);
+ atomic_inc(&filter->notif->requests);
+ wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM);
}
mutex_unlock(&filter->notify_lock);
}
@@ -1101,7 +1571,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
void __user *buf)
{
struct seccomp_notif_resp resp = {};
- struct seccomp_knotif *knotif = NULL, *cur;
+ struct seccomp_knotif *knotif;
long ret;
if (copy_from_user(&resp, buf, sizeof(resp)))
@@ -1118,13 +1588,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == resp.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, resp.id);
if (!knotif) {
ret = -ENOENT;
goto out;
@@ -1141,7 +1605,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
- complete(&knotif->ready);
+ if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ complete_on_current_cpu(&knotif->ready);
+ else
+ complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
return ret;
@@ -1150,7 +1617,7 @@ out:
static long seccomp_notify_id_valid(struct seccomp_filter *filter,
void __user *buf)
{
- struct seccomp_knotif *knotif = NULL;
+ struct seccomp_knotif *knotif;
u64 id;
long ret;
@@ -1161,17 +1628,143 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- ret = -ENOENT;
- list_for_each_entry(knotif, &filter->notif->notifications, list) {
- if (knotif->id == id) {
- if (knotif->state == SECCOMP_NOTIFY_SENT)
- ret = 0;
- goto out;
+ knotif = find_notification(filter, id);
+ if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+ ret = 0;
+ else
+ ret = -ENOENT;
+
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+ unsigned long flags)
+{
+ long ret;
+
+ if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+ filter->notif->flags = flags;
+ mutex_unlock(&filter->notify_lock);
+ return 0;
+}
+
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+ struct seccomp_notif_addfd __user *uaddfd,
+ unsigned int size)
+{
+ struct seccomp_notif_addfd addfd;
+ struct seccomp_knotif *knotif;
+ struct seccomp_kaddfd kaddfd;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+ if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+ return -EINVAL;
+
+ ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+ if (ret)
+ return ret;
+
+ if (addfd.newfd_flags & ~O_CLOEXEC)
+ return -EINVAL;
+
+ if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND))
+ return -EINVAL;
+
+ if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+ return -EINVAL;
+
+ kaddfd.file = fget(addfd.srcfd);
+ if (!kaddfd.file)
+ return -EBADF;
+
+ kaddfd.ioctl_flags = addfd.flags;
+ kaddfd.flags = addfd.newfd_flags;
+ kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
+ kaddfd.fd = addfd.newfd;
+ init_completion(&kaddfd.completion);
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ goto out;
+
+ knotif = find_notification(filter, addfd.id);
+ if (!knotif) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /*
+ * We do not want to allow for FD injection to occur before the
+ * notification has been picked up by a userspace handler, or after
+ * the notification has been replied to.
+ */
+ if (knotif->state != SECCOMP_NOTIFY_SENT) {
+ ret = -EINPROGRESS;
+ goto out_unlock;
+ }
+
+ if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
+ /*
+ * Disallow queuing an atomic addfd + send reply while there are
+ * some addfd requests still to process.
+ *
+ * There is no clear reason to support it and allows us to keep
+ * the loop on the other side straight-forward.
+ */
+ if (!list_empty(&knotif->addfd)) {
+ ret = -EBUSY;
+ goto out_unlock;
}
+
+ /* Allow exactly only one reply */
+ knotif->state = SECCOMP_NOTIFY_REPLIED;
}
-out:
+ list_add(&kaddfd.list, &knotif->addfd);
+ complete(&knotif->ready);
mutex_unlock(&filter->notify_lock);
+
+ /* Now we wait for it to be processed or be interrupted */
+ ret = wait_for_completion_interruptible(&kaddfd.completion);
+ if (ret == 0) {
+ /*
+ * We had a successful completion. The other side has already
+ * removed us from the addfd queue, and
+ * wait_for_completion_interruptible has a memory barrier upon
+ * success that lets us read this value directly without
+ * locking.
+ */
+ ret = kaddfd.ret;
+ goto out;
+ }
+
+ mutex_lock(&filter->notify_lock);
+ /*
+ * Even though we were woken up by a signal and not a successful
+ * completion, a completion may have happened in the mean time.
+ *
+ * We need to check again if the addfd request has been handled,
+ * and if not, we will remove it from the queue.
+ */
+ if (list_empty(&kaddfd.list))
+ ret = kaddfd.ret;
+ else
+ list_del(&kaddfd.list);
+
+out_unlock:
+ mutex_unlock(&filter->notify_lock);
+out:
+ fput(kaddfd.file);
+
return ret;
}
@@ -1181,13 +1774,24 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
struct seccomp_filter *filter = file->private_data;
void __user *buf = (void __user *)arg;
+ /* Fixed-size ioctls */
switch (cmd) {
case SECCOMP_IOCTL_NOTIF_RECV:
return seccomp_notify_recv(filter, buf);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+ return seccomp_notify_set_flags(filter, arg);
+ }
+
+ /* Extensible Argument ioctls */
+#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+ switch (EA_IOCTL(cmd)) {
+ case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+ return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
default:
return -EINVAL;
}
@@ -1200,7 +1804,7 @@ static __poll_t seccomp_notify_poll(struct file *file,
__poll_t ret = 0;
struct seccomp_knotif *cur;
- poll_wait(file, &filter->notif->wqh, poll_tab);
+ poll_wait(file, &filter->wqh, poll_tab);
if (mutex_lock_interruptible(&filter->notify_lock) < 0)
return EPOLLERR;
@@ -1216,6 +1820,9 @@ static __poll_t seccomp_notify_poll(struct file *file,
mutex_unlock(&filter->notify_lock);
+ if (refcount_read(&filter->users) == 0)
+ ret |= EPOLLHUP;
+
return ret;
}
@@ -1228,23 +1835,15 @@ static const struct file_operations seccomp_notify_ops = {
static struct file *init_listener(struct seccomp_filter *filter)
{
- struct file *ret = ERR_PTR(-EBUSY);
- struct seccomp_filter *cur;
-
- for (cur = current->seccomp.filter; cur; cur = cur->prev) {
- if (cur->notif)
- goto out;
- }
+ struct file *ret;
ret = ERR_PTR(-ENOMEM);
filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
if (!filter->notif)
goto out;
- sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);
- init_waitqueue_head(&filter->notif->wqh);
ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
filter, O_RDWR);
@@ -1256,11 +1855,36 @@ static struct file *init_listener(struct seccomp_filter *filter)
out_notif:
if (IS_ERR(ret))
- kfree(filter->notif);
+ seccomp_notify_free(filter);
out:
return ret;
}
+/*
+ * Does @new_child have a listener while an ancestor also has a listener?
+ * If so, we'll want to reject this filter.
+ * This only has to be tested for the current process, even in the TSYNC case,
+ * because TSYNC installs @child with the same parent on all threads.
+ * Note that @new_child is not hooked up to its parent at this point yet, so
+ * we use current->seccomp.filter.
+ */
+static bool has_duplicate_listener(struct seccomp_filter *new_child)
+{
+ struct seccomp_filter *cur;
+
+ /* must be protected against concurrent TSYNC */
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!new_child->notif)
+ return false;
+ for (cur = current->seccomp.filter; cur; cur = cur->prev) {
+ if (cur->notif)
+ return true;
+ }
+
+ return false;
+}
+
/**
* seccomp_set_mode_filter: internal function for setting seccomp filter
* @flags: flags to change filter behavior
@@ -1299,6 +1923,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
+ /*
+ * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+ * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
@@ -1332,6 +1964,11 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (!seccomp_may_assign_mode(seccomp_mode))
goto out;
+ if (has_duplicate_listener(prepared)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = seccomp_attach_filter(flags, prepared);
if (ret)
goto out;
@@ -1349,6 +1986,7 @@ out_put_fd:
listener_f->private_data = NULL;
fput(listener_f);
put_unused_fd(listener);
+ seccomp_notify_detach(prepared);
} else {
fd_install(listener, listener_f);
ret = listener;
@@ -1696,7 +2334,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@@ -1714,7 +2352,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
@@ -1794,12 +2432,6 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
return ret;
}
-static struct ctl_path seccomp_sysctl_path[] = {
- { .procname = "kernel", },
- { .procname = "seccomp", },
- { }
-};
-
static struct ctl_table seccomp_sysctl_table[] = {
{
.procname = "actions_avail",
@@ -1818,17 +2450,66 @@ static struct ctl_table seccomp_sysctl_table[] = {
static int __init seccomp_sysctl_init(void)
{
- struct ctl_table_header *hdr;
-
- hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
- if (!hdr)
- pr_warn("seccomp: sysctl registration failed\n");
- else
- kmemleak_not_leak(hdr);
-
+ register_sysctl_init("kernel/seccomp", seccomp_sysctl_table);
return 0;
}
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+ const void *bitmap, size_t bitmap_size)
+{
+ int nr;
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ bool cached = test_bit(nr, bitmap);
+ char *status = cached ? "ALLOW" : "FILTER";
+
+ seq_printf(m, "%s %d %s\n", name, nr, status);
+ }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct seccomp_filter *f;
+ unsigned long flags;
+
+ /*
+ * We don't want some sandboxed process to know what their seccomp
+ * filters consist of.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!lock_task_sighand(task, &flags))
+ return -ESRCH;
+
+ f = READ_ONCE(task->seccomp.filter);
+ if (!f) {
+ unlock_task_sighand(task, &flags);
+ return 0;
+ }
+
+ /* prevent filter from being freed while we are printing it */
+ __get_seccomp_filter(f);
+ unlock_task_sighand(task, &flags);
+
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+ f->cache.allow_native,
+ SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+ f->cache.allow_compat,
+ SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ __put_seccomp_filter(f);
+ return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
diff --git a/kernel/signal.c b/kernel/signal.c
index 6f16f7c5d375..c9c57d053ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -32,7 +33,7 @@
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
@@ -43,9 +44,9 @@
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
-#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
+#include <linux/sysctl.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -55,6 +56,7 @@
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
+#include <asm/syscall.h> /* for syscall_get_* */
/*
* SLAB caches for signal bits.
@@ -169,20 +171,9 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
return false;
}
-/*
- * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
- * This is superfluous when called on current, the wakeup is a harmless no-op.
- */
-void recalc_sigpending_and_wake(struct task_struct *t)
-{
- if (recalc_sigpending_tsk(t))
- signal_wake_up(t, 0);
-}
-
void recalc_sigpending(void)
{
- if (!recalc_sigpending_tsk(current) && !freezing(current) &&
- !klp_patch_pending(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -391,16 +382,17 @@ static bool task_participate_group_stop(struct task_struct *task)
void task_join_group_stop(struct task_struct *task)
{
+ unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;
+ struct signal_struct *sig = current->signal;
+
+ if (sig->group_stop_count) {
+ sig->group_stop_count++;
+ mask |= JOBCTL_STOP_CONSUME;
+ } else if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ return;
+
/* Have the new thread join an on-going signal group stop */
- unsigned long jobctl = current->jobctl;
- if (jobctl & JOBCTL_STOP_PENDING) {
- struct signal_struct *sig = current->signal;
- unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
- unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
- if (task_set_jobctl_pending(task, signr | gstop)) {
- sig->group_stop_count++;
- }
- }
+ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
}
/*
@@ -409,11 +401,12 @@ void task_join_group_stop(struct task_struct *task)
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
+ int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct user_struct *user;
- int sigpending;
+ struct ucounts *ucounts;
+ long sigpending;
/*
* Protect access to @t credentials. This can go away when all
@@ -424,27 +417,25 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
* changes from/to zero.
*/
rcu_read_lock();
- user = __task_cred(t)->user;
- sigpending = atomic_inc_return(&user->sigpending);
- if (sigpending == 1)
- get_uid(user);
+ ucounts = task_ucounts(t);
+ sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
+ if (!sigpending)
+ return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
+ q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
- if (atomic_dec_and_test(&user->sigpending))
- free_uid(user);
+ dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
} else {
INIT_LIST_HEAD(&q->list);
- q->flags = 0;
- q->user = user;
+ q->flags = sigqueue_flags;
+ q->ucounts = ucounts;
}
-
return q;
}
@@ -452,8 +443,10 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (atomic_dec_and_test(&q->user->sigpending))
- free_uid(q->user);
+ if (q->ucounts) {
+ dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
+ q->ucounts = NULL;
+ }
kmem_cache_free(sigqueue_cachep, q);
}
@@ -560,6 +553,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig)
if (handler != SIG_IGN && handler != SIG_DFL)
return false;
+ /* If dying, we handle all new signals by ignoring them */
+ if (fatal_signal_pending(tsk))
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
@@ -625,7 +622,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*
* All callers have to hold the siglock.
*/
-int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info)
+int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
+ kernel_siginfo_t *info, enum pid_type *type)
{
bool resched_timer = false;
int signr;
@@ -633,8 +631,10 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
+ *type = PIDTYPE_PID;
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
+ *type = PIDTYPE_TGID;
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
@@ -758,7 +758,10 @@ still_pending:
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+ lockdep_assert_held(&t->sighand->siglock);
+
set_tsk_thread_flag(t, TIF_SIGPENDING);
+
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
@@ -851,7 +854,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
*/
if (!sid || sid == task_session(current))
break;
- /* fall through */
+ fallthrough;
default:
return -EPERM;
}
@@ -880,7 +883,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
static void ptrace_trap_notify(struct task_struct *t)
{
WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
- assert_spin_locked(&t->sighand->siglock);
+ lockdep_assert_held(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -902,12 +905,13 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
struct task_struct *t;
sigset_t flush;
- if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
- if (!(signal->flags & SIGNAL_GROUP_EXIT))
+ if (signal->flags & SIGNAL_GROUP_EXIT) {
+ if (signal->core_state)
return sig == SIGKILL;
/*
- * The process is in the middle of dying, nothing to do.
+ * The process is in the middle of dying, drop the signal.
*/
+ return false;
} else if (sig_kernel_stop(sig)) {
/*
* This is a stop signal. Remove SIGCONT from all queues.
@@ -926,9 +930,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
for_each_thread(p, t) {
flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- if (likely(!(t->ptrace & PT_SEIZED)))
+ if (likely(!(t->ptrace & PT_SEIZED))) {
+ t->jobctl &= ~JOBCTL_STOPPED;
wake_up_state(t, __TASK_STOPPED);
- else
+ } else
ptrace_trap_notify(t);
}
@@ -983,7 +988,7 @@ static inline bool wants_signal(int sig, struct task_struct *p)
if (task_is_stopped_or_traced(p))
return false;
- return task_curr(p) || !signal_pending(p);
+ return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
@@ -994,8 +999,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
/*
* Now find a thread we can wake up to take the signal off the queue.
*
- * If the main thread wants the signal, it gets first crack.
- * Probably the least surprising to the average bear.
+ * Try the suggested task first (may or may not be the main thread).
*/
if (wants_signal(sig, p))
t = p;
@@ -1028,7 +1032,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & SIGNAL_GROUP_EXIT) &&
+ (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL || !p->ptrace)) {
/*
@@ -1044,12 +1048,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
- t = p;
- do {
+ __for_each_thread(signal, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- } while_each_thread(p, t);
+ }
return;
}
}
@@ -1067,15 +1070,15 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
- enum pid_type type, bool force)
+static int __send_signal_locked(int sig, struct kernel_siginfo *info,
+ struct task_struct *t, enum pid_type type, bool force)
{
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
int ret = 0, result;
- assert_spin_locked(&t->sighand->siglock);
+ lockdep_assert_held(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
if (!prepare_signal(sig, t, force))
@@ -1112,7 +1115,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
else
override_rlimit = 0;
- q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
+ q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
+
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
@@ -1195,9 +1199,11 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
case SIL_TIMER:
case SIL_POLL:
case SIL_FAULT:
+ case SIL_FAULT_TRAPNO:
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
+ case SIL_FAULT_PERF_EVENT:
case SIL_SYS:
ret = false;
break;
@@ -1205,8 +1211,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
return ret;
}
-static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
- enum pid_type type)
+int send_signal_locked(int sig, struct kernel_siginfo *info,
+ struct task_struct *t, enum pid_type type)
{
/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
bool force = false;
@@ -1238,13 +1244,23 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct
force = true;
}
}
- return __send_signal(sig, info, t, type, force);
+ return __send_signal_locked(sig, info, t, type, force);
}
static void print_fatal_signal(int signr)
{
- struct pt_regs *regs = signal_pt_regs();
- pr_info("potentially unexpected fatal signal %d.\n", signr);
+ struct pt_regs *regs = task_pt_regs(current);
+ struct file *exe_file;
+
+ exe_file = get_task_exe_file(current);
+ if (exe_file) {
+ pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
+ exe_file, current->comm, signr);
+ fput(exe_file);
+ } else {
+ pr_info("%s: potentially unexpected fatal signal %d.\n",
+ current->comm, signr);
+ }
#if defined(__i386__) && !defined(__arch_um__)
pr_info("code at %08lx: ", regs->ip);
@@ -1274,12 +1290,6 @@ static int __init setup_print_fatal_signals(char *str)
__setup("print-fatal-signals=", setup_print_fatal_signals);
-int
-__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
-{
- return send_signal(sig, info, p, PIDTYPE_TGID);
-}
-
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
enum pid_type type)
{
@@ -1287,13 +1297,19 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
- ret = send_signal(sig, info, p, type);
+ ret = send_signal_locked(sig, info, p, type);
unlock_task_sighand(p, &flags);
}
return ret;
}
+enum sig_handler {
+ HANDLER_CURRENT, /* If reachable use the current handler */
+ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
+ HANDLER_EXIT, /* Only visible as the process exit code */
+};
+
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1306,7 +1322,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
* that is why we also clear SIGNAL_UNKILLABLE.
*/
static int
-force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
+force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
+ enum sig_handler handler)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1317,20 +1334,24 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
blocked = sigismember(&t->blocked, sig);
- if (blocked || ignored) {
+ if (blocked || ignored || (handler != HANDLER_CURRENT)) {
action->sa.sa_handler = SIG_DFL;
- if (blocked) {
+ if (handler == HANDLER_EXIT)
+ action->sa.sa_flags |= SA_IMMUTABLE;
+ if (blocked)
sigdelset(&t->blocked, sig);
- recalc_sigpending_and_wake(t);
- }
}
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
- * debugging to leave init killable.
+ * debugging to leave init killable. But HANDLER_EXIT is always fatal.
*/
- if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
+ if (action->sa.sa_handler == SIG_DFL &&
+ (!t->ptrace || (handler == HANDLER_EXIT)))
t->signal->flags &= ~SIGNAL_UNKILLABLE;
- ret = send_signal(sig, info, t, PIDTYPE_PID);
+ ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
+ /* This can happen if the signal was already pending and blocked */
+ if (!task_sigpending(t))
+ signal_wake_up(t, 0);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
@@ -1338,7 +1359,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
int force_sig_info(struct kernel_siginfo *info)
{
- return force_sig_info_to_task(info, current);
+ return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}
/*
@@ -1346,14 +1367,16 @@ int force_sig_info(struct kernel_siginfo *info)
*/
int zap_other_threads(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
int count = 0;
p->signal->group_stop_count = 0;
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- count++;
+ /* Don't require de_thread to wait for the vhost_worker */
+ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
+ count++;
/* Don't bother with already dead threads */
if (t->exit_state)
@@ -1397,6 +1420,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
return sighand;
}
+#ifdef CONFIG_LOCKDEP
+void lockdep_assert_task_sighand_held(struct task_struct *task)
+{
+ struct sighand_struct *sighand;
+
+ rcu_read_lock();
+ sighand = rcu_dereference(task->sighand);
+ if (sighand)
+ lockdep_assert_held(&sighand->siglock);
+ else
+ WARN_ON_ONCE(1);
+ rcu_read_unlock();
+}
+#endif
+
/*
* send signal info to all the members of a group
*/
@@ -1423,16 +1461,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info,
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
- int retval, success;
+ int ret = -ESRCH;
- success = 0;
- retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
- success |= !err;
- retval = err;
+ /*
+ * If group_send_sig_info() succeeds at least once ret
+ * becomes 0 and after that the code below has no effect.
+ * Otherwise we return the last err or -ESRCH if this
+ * process group is empty.
+ */
+ if (ret)
+ ret = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return success ? 0 : retval;
+
+ return ret;
}
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
@@ -1535,7 +1578,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
if (sig) {
if (lock_task_sighand(p, &flags)) {
- ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
+ ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
@@ -1630,6 +1673,32 @@ void force_sig(int sig)
}
EXPORT_SYMBOL(force_sig);
+void force_fatal_sig(int sig)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
+}
+
+void force_exit_sig(int sig)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info_to_task(&info, current, HANDLER_EXIT);
+}
+
/*
* When things go south during signal handling, we
* will force a SIGSEGV. And if the signal that caused
@@ -1638,21 +1707,14 @@ EXPORT_SYMBOL(force_sig);
*/
void force_sigsegv(int sig)
{
- struct task_struct *p = current;
-
- if (sig == SIGSEGV) {
- unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
- force_sig(SIGSEGV);
+ if (sig == SIGSEGV)
+ force_fatal_sig(SIGSEGV);
+ else
+ force_sig(SIGSEGV);
}
-int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ struct task_struct *t)
{
struct kernel_siginfo info;
@@ -1661,30 +1723,15 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ARCH_SI_TRAPNO
- info.si_trapno = trapno;
-#endif
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
- return force_sig_info_to_task(&info, t);
+ return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}
-int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+int force_sig_fault(int sig, int code, void __user *addr)
{
- return force_sig_fault_to_task(sig, code, addr
- ___ARCH_SI_TRAPNO(trapno)
- ___ARCH_SI_IA64(imm, flags, isr), current);
+ return force_sig_fault_to_task(sig, code, addr, current);
}
-int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
struct kernel_siginfo info;
@@ -1693,14 +1740,6 @@ int send_sig_fault(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ARCH_SI_TRAPNO
- info.si_trapno = trapno;
-#endif
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
return send_sig_info(info.si_signo, &info, t);
}
@@ -1762,6 +1801,55 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
}
#endif
+int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_PERF;
+ info.si_addr = addr;
+ info.si_perf_data = sig_data;
+ info.si_perf_type = type;
+
+ /*
+ * Signals generated by perf events should not terminate the whole
+ * process if SIGTRAP is blocked, however, delivering the signal
+ * asynchronously is better than not delivering at all. But tell user
+ * space if the signal was asynchronous, so it can clearly be
+ * distinguished from normal synchronous ones.
+ */
+ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ?
+ TRAP_PERF_FLAG_ASYNC :
+ 0;
+
+ return send_sig_info(info.si_signo, &info, current);
+}
+
+/**
+ * force_sig_seccomp - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ * @force_coredump: true to trigger a coredump
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+int force_sig_seccomp(int syscall, int reason, bool force_coredump)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch(current);
+ info.si_syscall = syscall;
+ return force_sig_info_to_task(&info, current,
+ force_coredump ? HANDLER_EXIT : HANDLER_CURRENT);
+}
+
/* For the crazy architectures that include trap information in
* the errno field, instead of an actual errno value.
*/
@@ -1777,6 +1865,39 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
return force_sig_info(&info);
}
+/* For the rare architectures that include trap information using
+ * si_trapno.
+ */
+int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_trapno = trapno;
+ return force_sig_info(&info);
+}
+
+/* For the rare architectures that include trap information using
+ * si_trapno.
+ */
+int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
+ struct task_struct *t)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_trapno = trapno;
+ return send_sig_info(info.si_signo, &info, t);
+}
+
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
@@ -1806,12 +1927,7 @@ EXPORT_SYMBOL(kill_pid);
*/
struct sigqueue *sigqueue_alloc(void)
{
- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
-
- if (q)
- q->flags |= SIGQUEUE_PREALLOC;
-
- return q;
+ return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
}
void sigqueue_free(struct sigqueue *q)
@@ -1851,8 +1967,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
ret = -1;
rcu_read_lock();
+
+ /*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
t = pid_task(pid, type);
- if (!t || !likely(lock_task_sighand(t, &flags)))
+ if (!t)
+ goto ret;
+ if (type != PIDTYPE_PID && same_thread_group(t, current))
+ t = current;
+ if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
@@ -1911,12 +2043,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
bool autoreap = false;
u64 utime, stime;
- BUG_ON(sig == -1);
+ WARN_ON_ONCE(sig == -1);
- /* do_notify_parent_cldstop should have been called instead. */
- BUG_ON(task_is_stopped_or_traced(tsk));
+ /* do_notify_parent_cldstop should have been called instead. */
+ WARN_ON_ONCE(task_is_stopped_or_traced(tsk));
- BUG_ON(!tsk->ptrace &&
+ WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/* Wake up all pidfd waiters */
@@ -1994,7 +2126,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
* parent's namespaces.
*/
if (valid_signal(sig) && sig)
- __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
+ __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
@@ -2064,7 +2196,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_lock_irqsave(&sighand->siglock, flags);
if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
- __group_send_sig_info(SIGCHLD, &info, parent);
+ send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
/*
* Even if SIGCHLD is not generated, we must wake up wait4 calls.
*/
@@ -2072,40 +2204,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
-static inline bool may_ptrace_stop(void)
-{
- if (!likely(current->ptrace))
- return false;
- /*
- * Are we in the middle of do_coredump?
- * If so and our tracer is also part of the coredump stopping
- * is a deadlock situation, and pointless because our tracer
- * is dead so don't allow us to stop.
- * If SIGKILL was already sent before the caller unlocked
- * ->siglock we must see ->core_state != NULL. Otherwise it
- * is safe to enter schedule().
- *
- * This is almost outdated, a task with the pending SIGKILL can't
- * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
- * after SIGKILL was already dequeued.
- */
- if (unlikely(current->mm->core_state) &&
- unlikely(current->mm == current->parent->mm))
- return false;
-
- return true;
-}
-
-/*
- * Return non-zero if there is a SIGKILL that should be waking us up.
- * Called with the siglock held.
- */
-static bool sigkill_pending(struct task_struct *tsk)
-{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-}
-
/*
* This must be called with current->sighand->siglock held.
*
@@ -2114,16 +2212,18 @@ static bool sigkill_pending(struct task_struct *tsk)
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
- * If we actually decide not to stop at all because the tracer
- * is gone, we keep current->exit_code unless clear_code.
+ * Returns the signal the ptracer requested the code resume
+ * with. If the code did not stop because the tracer is gone,
+ * the stop signal remains unchanged unless clear_code.
*/
-static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
+static int ptrace_stop(int exit_code, int why, unsigned long message,
+ kernel_siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
bool gstop_done = false;
- if (arch_ptrace_stop_needed(exit_code, info)) {
+ if (arch_ptrace_stop_needed()) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
@@ -2131,18 +2231,23 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
- * Meanwhile, a SIGKILL could come in before we retake the
- * siglock. That must prevent us from sleeping in TASK_TRACED.
- * So after regaining the lock, we must check for SIGKILL.
*/
spin_unlock_irq(&current->sighand->siglock);
- arch_ptrace_stop(exit_code, info);
+ arch_ptrace_stop();
spin_lock_irq(&current->sighand->siglock);
- if (sigkill_pending(current))
- return;
}
+ /*
+ * After this point ptrace_signal_wake_up or signal_wake_up
+ * will clear TASK_TRACED if ptrace_unlink happens or a fatal
+ * signal comes in. Handle previous ptrace_unlinks and fatal
+ * signals here to prevent ptrace_stop sleeping in schedule.
+ */
+ if (!current->ptrace || __fatal_signal_pending(current))
+ return exit_code;
+
set_special_state(TASK_TRACED);
+ current->jobctl |= JOBCTL_TRACED;
/*
* We're committing to trapping. TRACED should be visible before
@@ -2164,6 +2269,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
*/
smp_wmb();
+ current->ptrace_message = message;
current->last_siginfo = info;
current->exit_code = exit_code;
@@ -2187,53 +2293,56 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
- if (may_ptrace_stop()) {
- /*
- * Notify parents of the stop.
- *
- * While ptraced, there are two parents - the ptracer and
- * the real_parent of the group_leader. The ptracer should
- * know about every stop while the real parent is only
- * interested in the completion of group stop. The states
- * for the two don't interact with each other. Notify
- * separately unless they're gonna be duplicates.
- */
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're gonna be duplicates.
+ */
+ if (current->ptrace)
do_notify_parent_cldstop(current, true, why);
- if (gstop_done && ptrace_reparented(current))
- do_notify_parent_cldstop(current, false, why);
+ if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
+ do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
+ /*
+ * The previous do_notify_parent_cldstop() invocation woke ptracer.
+ * One a PREEMPTION kernel this can result in preemption requirement
+ * which will be fulfilled after read_unlock() and the ptracer will be
+ * put on the CPU.
+ * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+ * this task wait in schedule(). If this task gets preempted then it
+ * remains enqueued on the runqueue. The ptracer will observe this and
+ * then sleep for a delay of one HZ tick. In the meantime this task
+ * gets scheduled, enters schedule() and will wait for the ptracer.
+ *
+ * This preemption point is not bad from a correctness point of
+ * view but extends the runtime by one HZ tick time due to the
+ * ptracer's sleep. The preempt-disable section ensures that there
+ * will be no preemption between unlock and schedule() and so
+ * improving the performance since the ptracer will observe that
+ * the tracee is scheduled out once it gets on the CPU.
+ *
+ * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+ * Therefore the task can be preempted after do_notify_parent_cldstop()
+ * before unlocking tasklist_lock so there is no benefit in doing this.
+ *
+ * In fact disabling preemption is harmful on PREEMPT_RT because
+ * the spinlock_t in cgroup_enter_frozen() must not be acquired
+ * with preemption disabled due to the 'sleeping' spinlock
+ * substitution of RT.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
- read_unlock(&tasklist_lock);
- cgroup_enter_frozen();
+ read_unlock(&tasklist_lock);
+ cgroup_enter_frozen();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable_no_resched();
- freezable_schedule();
- cgroup_leave_frozen(true);
- } else {
- /*
- * By the time we got the lock, our tracer went away.
- * Don't drop the lock yet, another tracer may come.
- *
- * If @gstop_done, the ptracer went away between group stop
- * completion and here. During detach, it would have set
- * JOBCTL_STOP_PENDING on us and we'll re-enter
- * TASK_STOPPED in do_signal_stop() on return, so notifying
- * the real parent of the group stop completion is enough.
- */
- if (gstop_done)
- do_notify_parent_cldstop(current, false, why);
-
- /* tasklist protects us from ptrace_freeze_traced() */
- __set_current_state(TASK_RUNNING);
- if (clear_code)
- current->exit_code = 0;
- read_unlock(&tasklist_lock);
- }
+ schedule();
+ cgroup_leave_frozen(true);
/*
* We are back. Now reacquire the siglock before touching
@@ -2241,10 +2350,13 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(&current->sighand->siglock);
+ exit_code = current->exit_code;
current->last_siginfo = NULL;
+ current->ptrace_message = 0;
+ current->exit_code = 0;
/* LISTENING can be set only during STOP traps, clear it */
- current->jobctl &= ~JOBCTL_LISTENING;
+ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);
/*
* Queued signals ignored us while we were stopped for tracing.
@@ -2252,9 +2364,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* This sets TIF_SIGPENDING, but never clears it.
*/
recalc_sigpending_tsk(current);
+ return exit_code;
}
-static void ptrace_do_notify(int signr, int exit_code, int why)
+static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
kernel_siginfo_t info;
@@ -2265,18 +2378,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
- ptrace_stop(exit_code, why, 1, &info);
+ return ptrace_stop(exit_code, why, message, &info);
}
-void ptrace_notify(int exit_code)
+int ptrace_notify(int exit_code, unsigned long message)
{
+ int signr;
+
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
- if (unlikely(current->task_works))
+ if (unlikely(task_work_pending(current)))
task_work_run();
spin_lock_irq(&current->sighand->siglock);
- ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+ signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
spin_unlock_irq(&current->sighand->siglock);
+ return signr;
}
/**
@@ -2314,7 +2430,8 @@ static bool do_signal_stop(int signr)
WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
- unlikely(signal_group_exit(sig)))
+ unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
+ unlikely(sig->group_exec_task))
return false;
/*
* There is no group stop already in progress. We must
@@ -2339,12 +2456,10 @@ static bool do_signal_stop(int signr)
sig->group_exit_code = signr;
sig->group_stop_count = 0;
-
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
- t = current;
- while_each_thread(current, t) {
+ for_other_threads(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
@@ -2372,6 +2487,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);
@@ -2392,7 +2508,7 @@ static bool do_signal_stop(int signr)
/* Now we don't run again until woken by SIGCONT or SIGKILL */
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
return true;
} else {
/*
@@ -2430,11 +2546,10 @@ static void do_jobctl_trap(void)
signr = SIGTRAP;
WARN_ON_ONCE(!signr);
ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
- CLD_STOPPED);
+ CLD_STOPPED, 0);
} else {
WARN_ON_ONCE(!signr);
ptrace_stop(signr, CLD_STOPPED, 0, NULL);
- current->exit_code = 0;
}
}
@@ -2468,14 +2583,14 @@ static void do_freezer_trap(void)
* immediately (if there is a non-fatal signal pending), and
* put the task into sleep.
*/
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
clear_thread_flag(TIF_SIGPENDING);
spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
}
-static int ptrace_signal(int signr, kernel_siginfo_t *info)
+static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
{
/*
* We do not check sig_kernel_stop(signr) but set this marker
@@ -2487,15 +2602,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
* comment in dequeue_signal().
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
- ptrace_stop(signr, CLD_TRAPPED, 0, info);
+ signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
- signr = current->exit_code;
if (signr == 0)
return signr;
- current->exit_code = 0;
-
/*
* Update the siginfo structure if the signal has
* changed. If the debugger wanted something
@@ -2515,20 +2627,50 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
}
/* If the (new) signal is now blocked, requeue it. */
- if (sigismember(&current->blocked, signr)) {
- send_signal(signr, info, current, PIDTYPE_PID);
+ if (sigismember(&current->blocked, signr) ||
+ fatal_signal_pending(current)) {
+ send_signal_locked(signr, info, current, type);
signr = 0;
}
return signr;
}
+static void hide_si_addr_tag_bits(struct ksignal *ksig)
+{
+ switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
+ case SIL_FAULT:
+ case SIL_FAULT_TRAPNO:
+ case SIL_FAULT_MCEERR:
+ case SIL_FAULT_BNDERR:
+ case SIL_FAULT_PKUERR:
+ case SIL_FAULT_PERF_EVENT:
+ ksig->info.si_addr = arch_untagged_si_addr(
+ ksig->info.si_addr, ksig->sig, ksig->info.si_code);
+ break;
+ case SIL_KILL:
+ case SIL_TIMER:
+ case SIL_POLL:
+ case SIL_CHLD:
+ case SIL_RT:
+ case SIL_SYS:
+ break;
+ }
+}
+
bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
int signr;
+ clear_notify_signal();
+ if (unlikely(task_work_pending(current)))
+ task_work_run();
+
+ if (!task_sigpending(current))
+ return false;
+
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2541,12 +2683,6 @@ bool get_signal(struct ksignal *ksig)
relock:
spin_lock_irq(&sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- if (unlikely(current->task_works)) {
- spin_unlock_irq(&sighand->siglock);
- task_work_run();
- goto relock;
- }
/*
* Every stopped thread goes here after wakeup. Check to see if
@@ -2584,18 +2720,21 @@ relock:
goto relock;
}
- /* Has this task already been marked for death? */
- if (signal_group_exit(signal)) {
- ksig->info.si_signo = signr = SIGKILL;
- sigdelset(&current->pending.signal, SIGKILL);
- trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
- &sighand->action[SIGKILL - 1]);
- recalc_sigpending();
- goto fatal;
- }
-
for (;;) {
struct k_sigaction *ka;
+ enum pid_type type;
+
+ /* Has this task already been marked for death? */
+ if ((signal->flags & SIGNAL_GROUP_EXIT) ||
+ signal->group_exec_task) {
+ clear_siginfo(&ksig->info);
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
+ &sighand->action[SIGKILL - 1]);
+ recalc_sigpending();
+ goto fatal;
+ }
if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
do_signal_stop(0))
@@ -2628,15 +2767,18 @@ relock:
* so that the instruction pointer in the signal stack
* frame points to the faulting instruction.
*/
+ type = PIDTYPE_PID;
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ signr = dequeue_signal(current, &current->blocked,
+ &ksig->info, &type);
if (!signr)
break; /* will return 0 */
- if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, &ksig->info);
+ if (unlikely(current->ptrace) && (signr != SIGKILL) &&
+ !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
+ signr = ptrace_signal(signr, &ksig->info, type);
if (!signr)
continue;
}
@@ -2738,25 +2880,37 @@ relock:
}
/*
+ * PF_USER_WORKER threads will catch and exit on fatal signals
+ * themselves. They have cleanup that must be performed, so
+ * we cannot call do_exit() on their behalf.
+ */
+ if (current->flags & PF_USER_WORKER)
+ goto out;
+
+ /*
* Death signals, no core dump.
*/
do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
-
+out:
ksig->sig = signr;
+
+ if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
+ hide_si_addr_tag_bits(ksig);
+
return ksig->sig > 0;
}
/**
- * signal_delivered -
+ * signal_delivered - called after signal delivery to update blocked signals
* @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
* delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
- * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is always blocked), and the signal itself is blocked unless %SA_NODEFER
* is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
static void signal_delivered(struct ksignal *ksig, int stepping)
@@ -2773,7 +2927,10 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
- tracehook_signal_handler(stepping);
+ if (current->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(current);
+ if (stepping)
+ ptrace_notify(SIGTRAP, 0);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2798,8 +2955,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
if (sigisemptyset(&retarget))
return;
- t = tsk;
- while_each_thread(tsk, t) {
+ for_other_threads(tsk, t) {
if (t->flags & PF_EXITING)
continue;
@@ -2808,7 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
- if (!signal_pending(t))
+ if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
@@ -2827,7 +2983,8 @@ void exit_signals(struct task_struct *tsk)
*/
cgroup_threadgroup_change_begin(tsk);
- if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+ if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -2838,11 +2995,12 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
- if (!signal_pending(tsk))
+ if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
@@ -2886,7 +3044,7 @@ long do_no_restart_syscall(struct restart_block *param)
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
- if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
@@ -3170,6 +3328,15 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
layout = SIL_FAULT_PKUERR;
#endif
+ else if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
+ layout = SIL_FAULT_PERF_EVENT;
+ else if (IS_ENABLED(CONFIG_SPARC) &&
+ (sig == SIGILL) && (si_code == ILL_ILLTRP))
+ layout = SIL_FAULT_TRAPNO;
+ else if (IS_ENABLED(CONFIG_ALPHA) &&
+ ((sig == SIGFPE) ||
+ ((sig == SIGTRAP) && (si_code == TRAP_UNK))))
+ layout = SIL_FAULT_TRAPNO;
}
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
@@ -3273,32 +3440,30 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
break;
case SIL_FAULT:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
+ break;
+ case SIL_FAULT_TRAPNO:
+ to->si_addr = ptr_to_compat(from->si_addr);
to->si_trapno = from->si_trapno;
-#endif
break;
case SIL_FAULT_MCEERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_lower = ptr_to_compat(from->si_lower);
to->si_upper = ptr_to_compat(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = ptr_to_compat(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_pkey = from->si_pkey;
break;
+ case SIL_FAULT_PERF_EVENT:
+ to->si_addr = ptr_to_compat(from->si_addr);
+ to->si_perf_data = from->si_perf_data;
+ to->si_perf_type = from->si_perf_type;
+ to->si_perf_flags = from->si_perf_flags;
+ break;
case SIL_CHLD:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
@@ -3353,32 +3518,30 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
break;
case SIL_FAULT:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
+ break;
+ case SIL_FAULT_TRAPNO:
+ to->si_addr = compat_ptr(from->si_addr);
to->si_trapno = from->si_trapno;
-#endif
break;
case SIL_FAULT_MCEERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_lower = compat_ptr(from->si_lower);
to->si_upper = compat_ptr(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = compat_ptr(from->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- to->si_trapno = from->si_trapno;
-#endif
to->si_pkey = from->si_pkey;
break;
+ case SIL_FAULT_PERF_EVENT:
+ to->si_addr = compat_ptr(from->si_addr);
+ to->si_perf_data = from->si_perf_data;
+ to->si_perf_type = from->si_perf_type;
+ to->si_perf_flags = from->si_perf_flags;
+ break;
case SIL_CHLD:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
@@ -3444,6 +3607,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
ktime_t *to = NULL, timeout = KTIME_MAX;
struct task_struct *tsk = current;
sigset_t mask = *which;
+ enum pid_type type;
int sig, ret = 0;
if (ts) {
@@ -3460,7 +3624,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
- sig = dequeue_signal(tsk, &mask, info);
+ sig = dequeue_signal(tsk, &mask, info, &type);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
@@ -3473,13 +3637,13 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- __set_current_state(TASK_INTERRUPTIBLE);
- ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
- HRTIMER_MODE_REL);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+ HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
- sig = dequeue_signal(tsk, &mask, info);
+ sig = dequeue_signal(tsk, &mask, info, &type);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -3670,7 +3834,8 @@ static bool access_pidfd_pidns(struct pid *pid)
return true;
}
-static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
+static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
+ siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
/*
@@ -3967,9 +4132,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
k = &p->sighand->action[sig-1];
spin_lock_irq(&p->sighand->siglock);
+ if (k->sa.sa_flags & SA_IMMUTABLE) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return -EINVAL;
+ }
if (oact)
*oact = *k;
+ /*
+ * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
+ * e.g. by having an architecture use the bit in their uapi.
+ */
+ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);
+
+ /*
+ * Clear unknown flag bits in order to allow userspace to detect missing
+ * support for flag bits and to allow the kernel to use non-uapi bits
+ * internally.
+ */
+ if (act)
+ act->sa.sa_flags &= UAPI_SA_FLAGS;
+ if (oact)
+ oact->sa.sa_flags &= UAPI_SA_FLAGS;
+
sigaction_compat_abi(act, oact);
if (act) {
@@ -4000,11 +4185,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
return 0;
}
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+static inline void sigaltstack_lock(void)
+ __acquires(&current->sighand->siglock)
+{
+ spin_lock_irq(&current->sighand->siglock);
+}
+
+static inline void sigaltstack_unlock(void)
+ __releases(&current->sighand->siglock)
+{
+ spin_unlock_irq(&current->sighand->siglock);
+}
+#else
+static inline void sigaltstack_lock(void) { }
+static inline void sigaltstack_unlock(void) { }
+#endif
+
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
size_t min_ss_size)
{
struct task_struct *t = current;
+ int ret = 0;
if (oss) {
memset(oss, 0, sizeof(stack_t));
@@ -4028,19 +4231,33 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
ss_mode != 0))
return -EINVAL;
+ /*
+ * Return before taking any locks if no actual
+ * sigaltstack changes were requested.
+ */
+ if (t->sas_ss_sp == (unsigned long)ss_sp &&
+ t->sas_ss_size == ss_size &&
+ t->sas_ss_flags == ss_flags)
+ return 0;
+
+ sigaltstack_lock();
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
if (unlikely(ss_size < min_ss_size))
- return -ENOMEM;
+ ret = -ENOMEM;
+ if (!sigaltstack_size_valid(ss_size))
+ ret = -ENOMEM;
}
-
- t->sas_ss_sp = (unsigned long) ss_sp;
- t->sas_ss_size = ss_size;
- t->sas_ss_flags = ss_flags;
+ if (!ret) {
+ t->sas_ss_sp = (unsigned long) ss_sp;
+ t->sas_ss_size = ss_size;
+ t->sas_ss_flags = ss_flags;
+ }
+ sigaltstack_unlock();
}
- return 0;
+ return ret;
}
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
@@ -4074,11 +4291,7 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#ifdef CONFIG_COMPAT
@@ -4133,11 +4346,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
&uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#endif
@@ -4538,10 +4747,14 @@ static inline void siginfo_buildtime_checks(void)
/* sigfault */
CHECK_OFFSET(si_addr);
+ CHECK_OFFSET(si_trapno);
CHECK_OFFSET(si_addr_lsb);
CHECK_OFFSET(si_lower);
CHECK_OFFSET(si_upper);
CHECK_OFFSET(si_pkey);
+ CHECK_OFFSET(si_perf_data);
+ CHECK_OFFSET(si_perf_type);
+ CHECK_OFFSET(si_perf_flags);
/* sigpoll */
CHECK_OFFSET(si_band);
@@ -4576,11 +4789,33 @@ static inline void siginfo_buildtime_checks(void)
#endif
}
+#if defined(CONFIG_SYSCTL)
+static struct ctl_table signal_debug_table[] = {
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
+ {
+ .procname = "exception-trace",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ { }
+};
+
+static int __init init_signal_sysctls(void)
+{
+ register_sysctl_init("debug", signal_debug_table);
+ return 0;
+}
+early_initcall(init_signal_sysctls);
+#endif /* CONFIG_SYSCTL */
+
void __init signals_init(void)
{
siginfo_buildtime_checks();
- sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}
#ifdef CONFIG_KGDB_KDB
@@ -4603,7 +4838,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
}
new_t = kdb_prev_t != t;
kdb_prev_t = t;
- if (t->state != TASK_RUNNING && new_t) {
+ if (!task_is_running(t) && new_t) {
spin_unlock(&t->sighand->siglock);
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
@@ -4613,7 +4848,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
"the deadlock.\n");
return;
}
- ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
+ ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
spin_unlock(&t->sighand->siglock);
if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
diff --git a/kernel/smp.c b/kernel/smp.c
index aa17eedff5be..f085ebcdf9e7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,17 +14,27 @@
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
+#include <linux/sched/clock.h>
+#include <linux/nmi.h>
+#include <linux/sched/debug.h>
+#include <linux/jump_label.h>
+
+#include <trace/events/ipi.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/csd.h>
+#undef CREATE_TRACE_POINTS
#include "smpboot.h"
#include "sched/smp.h"
-#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
+#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@@ -36,7 +46,9 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
-static void flush_smp_call_function_queue(bool warn_cpu_offline);
+static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
+
+static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
@@ -81,7 +93,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
- flush_smp_call_function_queue(false);
+ __flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
@@ -96,6 +108,178 @@ void __init call_function_init(void)
smpcfd_prepare_cpu(smp_processor_id());
}
+static __always_inline void
+send_call_function_single_ipi(int cpu)
+{
+ if (call_function_single_prep_ipi(cpu)) {
+ trace_ipi_send_cpu(cpu, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_single_ipi(cpu);
+ }
+}
+
+static __always_inline void
+send_call_function_ipi_mask(struct cpumask *mask)
+{
+ trace_ipi_send_cpumask(mask, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_ipi_mask(mask);
+}
+
+static __always_inline void
+csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
+{
+ trace_csd_function_entry(func, csd);
+ func(info);
+ trace_csd_function_exit(func, csd);
+}
+
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+
+static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
+
+/*
+ * Parse the csdlock_debug= kernel boot parameter.
+ *
+ * If you need to restore the old "ext" value that once provided
+ * additional debugging information, reapply the following commits:
+ *
+ * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
+ * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
+ */
+static int __init csdlock_debug(char *str)
+{
+ int ret;
+ unsigned int val = 0;
+
+ ret = get_option(&str, &val);
+ if (ret) {
+ if (val)
+ static_branch_enable(&csdlock_debug_enabled);
+ else
+ static_branch_disable(&csdlock_debug_enabled);
+ }
+
+ return 1;
+}
+__setup("csdlock_debug=", csdlock_debug);
+
+static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
+static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
+static DEFINE_PER_CPU(void *, cur_csd_info);
+
+static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
+module_param(csd_lock_timeout, ulong, 0444);
+static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */
+module_param(panic_on_ipistall, int, 0444);
+
+static atomic_t csd_bug_count = ATOMIC_INIT(0);
+
+/* Record current CSD work for current CPU, NULL to erase. */
+static void __csd_lock_record(call_single_data_t *csd)
+{
+ if (!csd) {
+ smp_mb(); /* NULL cur_csd after unlock. */
+ __this_cpu_write(cur_csd, NULL);
+ return;
+ }
+ __this_cpu_write(cur_csd_func, csd->func);
+ __this_cpu_write(cur_csd_info, csd->info);
+ smp_wmb(); /* func and info before csd. */
+ __this_cpu_write(cur_csd, csd);
+ smp_mb(); /* Update cur_csd before function call. */
+ /* Or before unlock, as the case may be. */
+}
+
+static __always_inline void csd_lock_record(call_single_data_t *csd)
+{
+ if (static_branch_unlikely(&csdlock_debug_enabled))
+ __csd_lock_record(csd);
+}
+
+static int csd_lock_wait_getcpu(call_single_data_t *csd)
+{
+ unsigned int csd_type;
+
+ csd_type = CSD_TYPE(csd);
+ if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
+ return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
+ return -1;
+}
+
+/*
+ * Complain if too much time spent waiting. Note that only
+ * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
+ * so waiting on other types gets much less information.
+ */
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+{
+ int cpu = -1;
+ int cpux;
+ bool firsttime;
+ u64 ts2, ts_delta;
+ call_single_data_t *cpu_cur_csd;
+ unsigned int flags = READ_ONCE(csd->node.u_flags);
+ unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
+
+ if (!(flags & CSD_FLAG_LOCK)) {
+ if (!unlikely(*bug_id))
+ return true;
+ cpu = csd_lock_wait_getcpu(csd);
+ pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
+ *bug_id, raw_smp_processor_id(), cpu);
+ return true;
+ }
+
+ ts2 = sched_clock();
+ /* How long since we last checked for a stuck CSD lock.*/
+ ts_delta = ts2 - *ts1;
+ if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
+ return false;
+
+ firsttime = !*bug_id;
+ if (firsttime)
+ *bug_id = atomic_inc_return(&csd_bug_count);
+ cpu = csd_lock_wait_getcpu(csd);
+ if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
+ cpux = 0;
+ else
+ cpux = cpu;
+ cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+ /* How long since this CSD lock was stuck. */
+ ts_delta = ts2 - ts0;
+ pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
+ firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
+ cpu, csd->func, csd->info);
+ /*
+ * If the CSD lock is still stuck after 5 minutes, it is unlikely
+ * to become unstuck. Use a signed comparison to avoid triggering
+ * on underflows when the TSC is out of sync between sockets.
+ */
+ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
+ if (cpu_cur_csd && csd != cpu_cur_csd) {
+ pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
+ *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
+ READ_ONCE(per_cpu(cur_csd_info, cpux)));
+ } else {
+ pr_alert("\tcsd: CSD lock (#%d) %s.\n",
+ *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
+ }
+ if (cpu >= 0) {
+ if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
+ dump_cpu_task(cpu);
+ if (!cpu_cur_csd) {
+ pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
+ arch_send_call_function_single_ipi(cpu);
+ }
+ }
+ if (firsttime)
+ dump_stack();
+ *ts1 = ts2;
+
+ return false;
+}
+
/*
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources
*
@@ -103,15 +287,44 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
+static void __csd_lock_wait(call_single_data_t *csd)
+{
+ int bug_id = 0;
+ u64 ts0, ts1;
+
+ ts1 = ts0 = sched_clock();
+ for (;;) {
+ if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+ break;
+ cpu_relax();
+ }
+ smp_acquire__after_ctrl_dep();
+}
+
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
- smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
+ if (static_branch_unlikely(&csdlock_debug_enabled)) {
+ __csd_lock_wait(csd);
+ return;
+ }
+
+ smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
+}
+#else
+static void csd_lock_record(call_single_data_t *csd)
+{
+}
+
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
+{
+ smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
+#endif
static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
- csd->flags |= CSD_FLAG_LOCK;
+ csd->node.u_flags |= CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@@ -123,12 +336,12 @@ static __always_inline void csd_lock(call_single_data_t *csd)
static __always_inline void csd_unlock(call_single_data_t *csd)
{
- WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
- smp_store_release(&csd->flags, 0);
+ smp_store_release(&csd->node.u_flags, 0);
}
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
@@ -136,9 +349,28 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
/*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
+ * We have to check the type of the CSD before queueing it, because
+ * once queued it can have its flags cleared by
+ * flush_smp_call_function_queue()
+ * even if we haven't sent the smp_call IPI yet (e.g. the stopper
+ * executes migration_cpu_stop() on the remote CPU).
+ */
+ if (trace_csd_queue_cpu_enabled()) {
+ call_single_data_t *csd;
+ smp_call_func_t func;
+
+ csd = container_of(node, call_single_data_t, node.llist);
+ func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
+ sched_ttwu_pending : csd->func;
+
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+ }
+
+ /*
+ * The list addition should be visible to the target CPU when it pops
+ * the head of the list to pull the entry off it in the IPI handler
+ * because of normal cache coherency rules implied by the underlying
+ * llist ops.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
@@ -166,9 +398,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
* We can unlock early even for the synchronous on-stack case,
* since we're doing this from the same CPU..
*/
+ csd_lock_record(csd);
csd_unlock(csd);
local_irq_save(flags);
- func(info);
+ csd_do_func(func, info, NULL);
+ csd_lock_record(NULL);
local_irq_restore(flags);
return 0;
}
@@ -178,7 +412,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
return -ENXIO;
}
- __smp_call_single_queue(cpu, &csd->llist);
+ __smp_call_single_queue(cpu, &csd->node.llist);
return 0;
}
@@ -191,11 +425,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
*/
void generic_smp_call_function_single_interrupt(void)
{
- flush_smp_call_function_queue(true);
+ __flush_smp_call_function_queue(true);
}
/**
- * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
@@ -208,22 +442,27 @@ void generic_smp_call_function_single_interrupt(void)
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
-static void flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned;
+ atomic_t *tbt;
lockdep_assert_irqs_disabled();
+ /* Allow waiters to send backtrace NMI from here onwards */
+ tbt = this_cpu_ptr(&trigger_backtrace);
+ atomic_set_release(tbt, 1);
+
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
/* There shouldn't be any pending callbacks on an offline CPU. */
if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
- !warned && !llist_empty(head))) {
+ !warned && entry != NULL)) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
@@ -231,7 +470,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
- llist_for_each_entry(csd, entry, llist) {
+ llist_for_each_entry(csd, entry, node.llist) {
switch (CSD_TYPE(csd)) {
case CSD_TYPE_ASYNC:
case CSD_TYPE_SYNC:
@@ -256,22 +495,24 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* First; run all SYNC callbacks, people are waiting for us.
*/
prev = NULL;
- llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
/* Do we wait until *after* callback? */
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
if (prev) {
- prev->next = &csd_next->llist;
+ prev->next = &csd_next->node.llist;
} else {
- entry = &csd_next->llist;
+ entry = &csd_next->node.llist;
}
- func(info);
+ csd_lock_record(csd);
+ csd_do_func(func, info, csd);
csd_unlock(csd);
+ csd_lock_record(NULL);
} else {
- prev = &csd->llist;
+ prev = &csd->node.llist;
}
}
@@ -282,47 +523,70 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* Second; run all !SYNC callbacks.
*/
prev = NULL;
- llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
int type = CSD_TYPE(csd);
if (type != CSD_TYPE_TTWU) {
if (prev) {
- prev->next = &csd_next->llist;
+ prev->next = &csd_next->node.llist;
} else {
- entry = &csd_next->llist;
+ entry = &csd_next->node.llist;
}
if (type == CSD_TYPE_ASYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
+ csd_lock_record(csd);
csd_unlock(csd);
- func(info);
+ csd_do_func(func, info, csd);
+ csd_lock_record(NULL);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
}
} else {
- prev = &csd->llist;
+ prev = &csd->node.llist;
}
}
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
- if (entry)
- sched_ttwu_pending(entry);
+ if (entry) {
+ csd = llist_entry(entry, typeof(*csd), node.llist);
+ csd_do_func(sched_ttwu_pending, entry, csd);
+ }
}
-void flush_smp_call_function_from_idle(void)
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * from task context (idle, migration thread)
+ *
+ * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
+ * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
+ * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
+ * handle queued SMP function calls before scheduling.
+ *
+ * The migration thread has to ensure that an eventually pending wakeup has
+ * been handled before it migrates a task.
+ */
+void flush_smp_call_function_queue(void)
{
+ unsigned int was_pending;
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
local_irq_save(flags);
- flush_smp_call_function_queue(true);
+ /* Get the already pending soft interrupts for RT enabled kernels */
+ was_pending = local_softirq_pending();
+ __flush_smp_call_function_queue(true);
+ if (local_softirq_pending())
+ do_softirq_post_smp_call_flush(was_pending);
+
local_irq_restore(flags);
}
@@ -339,7 +603,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
- .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
+ .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
};
int this_cpu;
int err;
@@ -375,6 +639,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd->func = func;
csd->info = info;
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+ csd->node.src = smp_processor_id();
+ csd->node.dst = cpu;
+#endif
err = generic_exec_single(cpu, csd);
@@ -388,7 +656,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
EXPORT_SYMBOL(smp_call_function_single);
/**
- * smp_call_function_single_async(): Run an asynchronous function on a
+ * smp_call_function_single_async() - Run an asynchronous function on a
* specific CPU.
* @cpu: The CPU to run on.
* @csd: Pre-allocated and setup data structure
@@ -407,6 +675,8 @@ EXPORT_SYMBOL(smp_call_function_single);
*
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
+ *
+ * Return: %0 on success or negative errno value on error
*/
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
@@ -414,12 +684,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
preempt_disable();
- if (csd->flags & CSD_FLAG_LOCK) {
+ if (csd->node.u_flags & CSD_FLAG_LOCK) {
err = -EBUSY;
goto out;
}
- csd->flags = CSD_FLAG_LOCK;
+ csd->node.u_flags = CSD_FLAG_LOCK;
smp_wmb();
err = generic_exec_single(cpu, csd);
@@ -474,12 +744,28 @@ call:
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
+/*
+ * Flags to be used as scf_flags argument of smp_call_function_many_cond().
+ *
+ * %SCF_WAIT: Wait until function execution is completed
+ * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask
+ */
+#define SCF_WAIT (1U << 0)
+#define SCF_RUN_LOCAL (1U << 1)
+
static void smp_call_function_many_cond(const struct cpumask *mask,
smp_call_func_t func, void *info,
- bool wait, smp_cond_func_t cond_func)
+ unsigned int scf_flags,
+ smp_cond_func_t cond_func)
{
+ int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ bool wait = scf_flags & SCF_WAIT;
+ int nr_cpus = 0;
+ bool run_remote = false;
+ bool run_local = false;
+
+ lockdep_assert_preemption_disabled();
/*
* Can deadlock when called with interrupts disabled.
@@ -487,8 +773,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress && !early_boot_irqs_disabled);
+ if (cpu_online(this_cpu) && !oops_in_progress &&
+ !early_boot_irqs_disabled)
+ lockdep_assert_irqs_enabled();
/*
* When @wait we can deadlock when we interrupt between llist_add() and
@@ -498,56 +785,69 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
*/
WARN_ON_ONCE(!in_task());
- /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
+ /* Check if we need local execution. */
+ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
+ run_local = true;
+
+ /* Check if we need remote execution, i.e., any CPU excluding this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+ if (cpu < nr_cpu_ids)
+ run_remote = true;
- /* No online cpus? We're done. */
- if (cpu >= nr_cpu_ids)
- return;
-
- /* Do we have another CPU which isn't us? */
- next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (next_cpu == this_cpu)
- next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
-
- /* Fastpath: do that cpu by itself. */
- if (next_cpu >= nr_cpu_ids) {
- if (!cond_func || cond_func(cpu, info))
- smp_call_function_single(cpu, func, info, wait);
- return;
- }
+ if (run_remote) {
+ cfd = this_cpu_ptr(&cfd_data);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
- cfd = this_cpu_ptr(&cfd_data);
+ cpumask_clear(cfd->cpumask_ipi);
+ for_each_cpu(cpu, cfd->cpumask) {
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
- cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- __cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ if (cond_func && !cond_func(cpu, info)) {
+ __cpumask_clear_cpu(cpu, cfd->cpumask);
+ continue;
+ }
- /* Some callers race with other cpus changing the passed mask */
- if (unlikely(!cpumask_weight(cfd->cpumask)))
- return;
+ csd_lock(csd);
+ if (wait)
+ csd->node.u_flags |= CSD_TYPE_SYNC;
+ csd->func = func;
+ csd->info = info;
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+ csd->node.src = smp_processor_id();
+ csd->node.dst = cpu;
+#endif
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+
+ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+ nr_cpus++;
+ last_cpu = cpu;
+ }
+ }
- cpumask_clear(cfd->cpumask_ipi);
- for_each_cpu(cpu, cfd->cpumask) {
- call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
+ /*
+ * Choose the most efficient way to send an IPI. Note that the
+ * number of CPUs might be zero due to concurrent changes to the
+ * provided mask.
+ */
+ if (nr_cpus == 1)
+ send_call_function_single_ipi(last_cpu);
+ else if (likely(nr_cpus > 1))
+ send_call_function_ipi_mask(cfd->cpumask_ipi);
+ }
- if (cond_func && !cond_func(cpu, info))
- continue;
+ if (run_local && (!cond_func || cond_func(this_cpu, info))) {
+ unsigned long flags;
- csd_lock(csd);
- if (wait)
- csd->flags |= CSD_TYPE_SYNC;
- csd->func = func;
- csd->info = info;
- if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
- __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+ local_irq_save(flags);
+ csd_do_func(func, info, NULL);
+ local_irq_restore(flags);
}
- /* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
-
- if (wait) {
+ if (run_remote && wait) {
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd;
@@ -558,12 +858,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
}
/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
+ * smp_call_function_many(): Run a function on a set of CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
+ * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
+ * (atomically) until function has completed on other CPUs. If
+ * %SCF_RUN_LOCAL is set, the function will also be run locally
+ * if the local CPU is set in the @cpumask.
*
* If @wait is true, then returns once @func has returned.
*
@@ -574,7 +876,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- smp_call_function_many_cond(mask, func, info, wait, NULL);
+ smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -617,7 +919,7 @@ EXPORT_SYMBOL(setup_max_cpus);
* SMP mode to <NUM>.
*/
-void __weak arch_disable_smp_support(void) { }
+void __weak __init arch_disable_smp_support(void) { }
static int __init nosmp(char *str)
{
@@ -634,9 +936,8 @@ static int __init nrcpus(char *str)
{
int nr_cpus;
- get_option(&str, &nr_cpus);
- if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
- nr_cpu_ids = nr_cpus;
+ if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ set_nr_cpu_ids(nr_cpus);
return 0;
}
@@ -654,14 +955,16 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
+#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
+#endif
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
- nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+ set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
}
/* Called by boot processor to activate the rest. */
@@ -687,62 +990,12 @@ void __init smp_init(void)
}
/*
- * Call a function on all processors. May be used during early boot while
- * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
- * of local_irq_disable/enable().
- */
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
- unsigned long flags;
-
- preempt_disable();
- smp_call_function(func, info, wait);
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- preempt_enable();
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/**
- * on_each_cpu_mask(): Run a function on processors specified by
- * cpumask, which may include the local processor.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. The
- * exception is that it may be used during early boot while
- * early_boot_irqs_disabled is set.
- */
-void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
- void *info, bool wait)
-{
- int cpu = get_cpu();
-
- smp_call_function_many(mask, func, info, wait);
- if (cpumask_test_cpu(cpu, mask)) {
- unsigned long flags;
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
- put_cpu();
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
-/*
* on_each_cpu_cond(): Call a function on each processor for which
* the supplied function cond_func returns true, optionally waiting
* for all the required CPUs to finish. This may include the local
* processor.
* @cond_func: A callback function that is passed a cpu id and
- * the the info parameter. The function is called
+ * the info parameter. The function is called
* with preemption disabled. The function should
* return a blooean value indicating whether to IPI
* the specified CPU.
@@ -761,27 +1014,17 @@ EXPORT_SYMBOL(on_each_cpu_mask);
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
{
- int cpu = get_cpu();
+ unsigned int scf_flags = SCF_RUN_LOCAL;
- smp_call_function_many_cond(mask, func, info, wait, cond_func);
- if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
- unsigned long flags;
+ if (wait)
+ scf_flags |= SCF_WAIT;
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
- put_cpu();
+ preempt_disable();
+ smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
+ preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait)
-{
- on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
static void do_nothing(void *unused)
{
}
@@ -815,19 +1058,23 @@ void wake_up_all_idle_cpus(void)
{
int cpu;
- preempt_disable();
- for_each_online_cpu(cpu) {
- if (cpu == smp_processor_id())
- continue;
-
- wake_up_if_idle(cpu);
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ if (cpu != smp_processor_id() && cpu_online(cpu))
+ wake_up_if_idle(cpu);
+ preempt_enable();
}
- preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
- * smp_call_on_cpu - Call a function on a specific cpu
+ * struct smp_call_on_cpu_struct - Call a function on a specific CPU
+ * @work: &work_struct
+ * @done: &completion to signal
+ * @func: function to call
+ * @data: function's data argument
+ * @ret: return value from @func
+ * @cpu: target CPU (%-1 for any CPU)
*
* Used to call a function on a specific cpu and wait for it to return.
* Optionally make sure the call is done on a specified physical cpu via vcpu
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 2efe1e206167..1992b62e980b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu)
if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
return tsk;
}
@@ -48,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void)
*
* Creates the thread if it does not exist.
*/
-static inline void idle_init(unsigned int cpu)
+static __always_inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -188,6 +187,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kfree(td);
return PTR_ERR(tsk);
}
+ kthread_set_per_cpu(tsk, cpu);
/*
* Park the thread so that it could start right on the CPU
* when it is available.
@@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
if (tsk) {
- kthread_stop(tsk);
- put_task_struct(tsk);
+ kthread_stop_put(tsk);
*per_cpu_ptr(ht->store, cpu) = NULL;
}
}
@@ -291,7 +290,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -304,7 +303,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
@@ -317,166 +316,11 @@ EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
*/
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
list_del(&plug_thread->list);
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
-
-static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
-
-/*
- * Called to poll specified CPU's state, for example, when waiting for
- * a CPU to come online.
- */
-int cpu_report_state(int cpu)
-{
- return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
-}
-
-/*
- * If CPU has died properly, set its state to CPU_UP_PREPARE and
- * return success. Otherwise, return -EBUSY if the CPU died after
- * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
- * if cpu_wait_death() timed out and the CPU still hasn't gotten around
- * to dying. In the latter two cases, the CPU might not be set up
- * properly, but it is up to the arch-specific code to decide.
- * Finally, -EIO indicates an unanticipated problem.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-int cpu_check_up_prepare(int cpu)
-{
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
- }
-
- switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
-
- case CPU_POST_DEAD:
-
- /* The CPU died properly, so just start it up again. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
-
- case CPU_DEAD_FROZEN:
-
- /*
- * Timeout during CPU death, so let caller know.
- * The outgoing CPU completed its processing, but after
- * cpu_wait_death() timed out and reported the error. The
- * caller is free to proceed, in which case the state
- * will be reset properly by cpu_set_state_online().
- * Proceeding despite this -EBUSY return makes sense
- * for systems where the outgoing CPUs take themselves
- * offline, with no post-death manipulation required from
- * a surviving CPU.
- */
- return -EBUSY;
-
- case CPU_BROKEN:
-
- /*
- * The most likely reason we got here is that there was
- * a timeout during CPU death, and the outgoing CPU never
- * did complete its processing. This could happen on
- * a virtualized system if the outgoing VCPU gets preempted
- * for more than five seconds, and the user attempts to
- * immediately online that same CPU. Trying again later
- * might return -EBUSY above, hence -EAGAIN.
- */
- return -EAGAIN;
-
- default:
-
- /* Should not happen. Famous last words. */
- return -EIO;
- }
-}
-
-/*
- * Mark the specified CPU online.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-void cpu_set_state_online(int cpu)
-{
- (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Wait for the specified CPU to exit the idle loop and die.
- */
-bool cpu_wait_death(unsigned int cpu, int seconds)
-{
- int jf_left = seconds * HZ;
- int oldstate;
- bool ret = true;
- int sleep_jf = 1;
-
- might_sleep();
-
- /* The outgoing CPU will normally get done quite quickly. */
- if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
- goto update_state;
- udelay(5);
-
- /* But if the outgoing CPU dawdles, wait increasingly long times. */
- while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
- schedule_timeout_uninterruptible(sleep_jf);
- jf_left -= sleep_jf;
- if (jf_left <= 0)
- break;
- sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
- }
-update_state:
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
- if (oldstate == CPU_DEAD) {
- /* Outgoing CPU died normally, update state. */
- smp_mb(); /* atomic_read() before update. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
- } else {
- /* Outgoing CPU still hasn't died, set state accordingly. */
- if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, CPU_BROKEN) != oldstate)
- goto update_state;
- ret = false;
- }
- return ret;
-}
-
-/*
- * Called by the outgoing CPU to report its successful death. Return
- * false if this report follows the surviving CPU's timing out.
- *
- * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
- * timed out. This approach allows architectures to omit calls to
- * cpu_check_up_prepare() and cpu_set_state_online() without defeating
- * the next cpu_wait_death()'s polling loop.
- */
-bool cpu_report_death(void)
-{
- int oldstate;
- int newstate;
- int cpu = smp_processor_id();
-
- do {
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
- if (oldstate != CPU_BROKEN)
- newstate = CPU_DEAD;
- else
- newstate = CPU_DEAD_FROZEN;
- } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, newstate) != oldstate);
- return newstate == CPU_DEAD;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c4201b7f42b1..210cf5f8d92c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -13,6 +13,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/local_lock.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
@@ -25,6 +26,9 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/wait_bit.h>
+
+#include <asm/softirq_stack.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -72,46 +76,234 @@ static void wakeup_softirqd(void)
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
- if (tsk && tsk->state != TASK_RUNNING)
+ if (tsk)
wake_up_process(tsk);
}
+#ifdef CONFIG_TRACE_IRQFLAGS
+DEFINE_PER_CPU(int, hardirqs_enabled);
+DEFINE_PER_CPU(int, hardirq_context);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
+#endif
+
/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirqs_disabled_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ * processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ *
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and
+ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a
+ * softirq disabled section to be preempted.
+ *
+ * The per task counter is used for softirq_count(), in_softirq() and
+ * in_serving_softirqs() because these counts are only valid when the task
+ * holding softirq_ctrl::lock is running.
+ *
+ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that
+ * the task which is in a softirq disabled section is preempted or blocks.
*/
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
+struct softirq_ctrl {
+ local_lock_t lock;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
+};
+
+/**
+ * local_bh_blocked() - Check for idle whether BH processing is blocked
+ *
+ * Returns false if the per CPU softirq::cnt is 0 otherwise true.
+ *
+ * This is invoked from the idle task to guard against false positive
+ * softirq pending warnings, which would happen when the task which holds
+ * softirq_ctrl::lock was the only running task on the CPU and blocks on
+ * some other lock.
+ */
+bool local_bh_blocked(void)
{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ return __this_cpu_read(softirq_ctrl.cnt) != 0;
+}
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long flags;
+ int newcnt;
+
+ WARN_ON_ONCE(in_hardirq());
+
+ /* First entry of a task into a BH disabled section? */
+ if (!current->softirq_disable_cnt) {
+ if (preemptible()) {
+ local_lock(&softirq_ctrl.lock);
+ /* Required to meet the RCU bottomhalf requirements. */
+ rcu_read_lock();
+ } else {
+ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+ }
+ }
+
+ /*
+ * Track the per CPU softirq disabled state. On RT this is per CPU
+ * state to allow preemption of bottom half disabled sections.
+ */
+ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void __local_bh_enable(unsigned int cnt, bool unlock)
+{
+ unsigned long flags;
+ int newcnt;
+
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_on(_RET_IP_);
+ raw_local_irq_restore(flags);
+ }
+
+ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
+
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ bool preempt_on = preemptible();
+ unsigned long flags;
+ u32 pending;
+ int curcnt;
+
+ WARN_ON_ONCE(in_hardirq());
+ lockdep_assert_irqs_enabled();
+
+ local_irq_save(flags);
+ curcnt = __this_cpu_read(softirq_ctrl.cnt);
+
+ /*
+ * If this is not reenabling soft interrupts, no point in trying to
+ * run pending ones.
+ */
+ if (curcnt != cnt)
+ goto out;
+
+ pending = local_softirq_pending();
+ if (!pending)
+ goto out;
+
+ /*
+ * If this was called from non preemptible context, wake up the
+ * softirq daemon.
+ */
+ if (!preempt_on) {
+ wakeup_softirqd();
+ goto out;
+ }
+
+ /*
+ * Adjust softirq count to SOFTIRQ_OFFSET which makes
+ * in_serving_softirq() become true.
+ */
+ cnt = SOFTIRQ_OFFSET;
+ __local_bh_enable(cnt, false);
+ __do_softirq();
- if (pending & SOFTIRQ_NOW_MASK)
- return false;
- return tsk && (tsk->state == TASK_RUNNING) &&
- !__kthread_should_park(tsk);
+out:
+ __local_bh_enable(cnt, preempt_on);
+ local_irq_restore(flags);
}
+EXPORT_SYMBOL(__local_bh_enable_ip);
/*
- * preempt_count and SOFTIRQ_OFFSET usage:
- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
- * softirq processing.
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
- * on local_bh_disable or local_bh_enable.
- * This lets us distinguish between whether we are currently processing
- * softirq and whether we just have bh disabled.
+ * Invoked from ksoftirqd_run() outside of the interrupt disabled section
+ * to acquire the per CPU local lock for reentrancy protection.
*/
+static inline void ksoftirqd_run_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ local_irq_disable();
+}
+
+/* Counterpart to ksoftirqd_run_begin() */
+static inline void ksoftirqd_run_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET, true);
+ WARN_ON_ONCE(in_interrupt());
+ local_irq_enable();
+}
+
+static inline void softirq_handle_begin(void) { }
+static inline void softirq_handle_end(void) { }
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+static inline void invoke_softirq(void)
+{
+ if (should_wake_ksoftirqd())
+ wakeup_softirqd();
+}
/*
- * This one is for softirq.c-internal use,
- * where hardirqs are disabled legitimately:
+ * flush_smp_call_function_queue() can raise a soft interrupt in a function
+ * call. On RT kernels this is undesired and the only known functionality
+ * in the block layer which does this is disabled on RT. If soft interrupts
+ * get raised which haven't been raised before the flush, warn so it can be
+ * investigated.
+ */
+void do_softirq_post_smp_call_flush(unsigned int was_pending)
+{
+ if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+ invoke_softirq();
+}
+
+#else /* CONFIG_PREEMPT_RT */
+
+/*
+ * This one is for softirq.c-internal use, where hardirqs are disabled
+ * legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
raw_local_irq_save(flags);
/*
@@ -158,14 +350,14 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
@@ -179,7 +371,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
* Keep preemption disabled until we are done with
* softirq processing:
*/
- preempt_count_sub(cnt - 1);
+ __preempt_count_sub(cnt - 1);
if (unlikely(!in_interrupt() && local_softirq_pending())) {
/*
@@ -197,6 +389,75 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+static inline void softirq_handle_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+}
+
+static inline void softirq_handle_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+}
+
+static inline void ksoftirqd_run_begin(void)
+{
+ local_irq_disable();
+}
+
+static inline void ksoftirqd_run_end(void)
+{
+ local_irq_enable();
+}
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return true;
+}
+
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
+ /*
+ * We can safely execute softirq on the current stack if
+ * it is the irq stack, because it should be near empty
+ * at this stage.
+ */
+ __do_softirq();
+#else
+ /*
+ * Otherwise, irq_exit() is called on the task stack that can
+ * be potentially deep already. So call softirq in its own stack
+ * to prevent from any overrun.
+ */
+ do_softirq_own_stack();
+#endif
+ } else {
+ wakeup_softirqd();
+ }
+}
+
+asmlinkage __visible void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ pending = local_softirq_pending();
+
+ if (pending)
+ do_softirq_own_stack();
+
+ local_irq_restore(flags);
+}
+
+#endif /* !CONFIG_PREEMPT_RT */
+
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
* but break the loop if need_resched() is set or after 2 ms.
@@ -224,7 +485,7 @@ static inline bool lockdep_softirq_start(void)
{
bool in_hardirq = false;
- if (lockdep_hardirq_context(current)) {
+ if (lockdep_hardirq_context()) {
in_hardirq = true;
lockdep_hardirq_exit();
}
@@ -264,10 +525,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
- account_irq_enter_time(current);
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ softirq_handle_begin();
in_hardirq = lockdep_softirq_start();
+ account_softirq_enter(current);
restart:
/* Reset the pending bitmask before enabling irqs */
@@ -301,8 +562,10 @@ restart:
pending >>= softirq_bit;
}
- if (__this_cpu_read(ksoftirqd) == current)
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ __this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
+
local_irq_disable();
pending = local_softirq_pending();
@@ -314,46 +577,24 @@ restart:
wakeup_softirqd();
}
+ account_softirq_exit(current);
lockdep_softirq_end(in_hardirq);
- account_irq_exit_time(current);
- __local_bh_enable(SOFTIRQ_OFFSET);
- WARN_ON_ONCE(in_interrupt());
+ softirq_handle_end();
current_restore_flags(old_flags, PF_MEMALLOC);
}
-asmlinkage __visible void do_softirq(void)
-{
- __u32 pending;
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
-
- pending = local_softirq_pending();
-
- if (pending && !ksoftirqd_running(pending))
- do_softirq_own_stack();
-
- local_irq_restore(flags);
-}
-
/**
* irq_enter_rcu - Enter an interrupt context with RCU watching
*/
void irq_enter_rcu(void)
{
- if (is_idle_task(current) && !in_interrupt()) {
- /*
- * Prevent raise_softirq from needlessly waking up ksoftirqd
- * here, as softirq will be serviced on return from interrupt.
- */
- local_bh_disable();
+ __irq_enter_raw();
+
+ if (tick_nohz_full_cpu(smp_processor_id()) ||
+ (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)))
tick_irq_enter();
- _local_bh_enable();
- }
- __irq_enter();
+
+ account_hardirq_enter(current);
}
/**
@@ -361,44 +602,18 @@ void irq_enter_rcu(void)
*/
void irq_enter(void)
{
- rcu_irq_enter();
+ ct_irq_enter();
irq_enter_rcu();
}
-static inline void invoke_softirq(void)
-{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
- if (!force_irqthreads) {
-#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
- /*
- * We can safely execute softirq on the current stack if
- * it is the irq stack, because it should be near empty
- * at this stage.
- */
- __do_softirq();
-#else
- /*
- * Otherwise, irq_exit() is called on the task stack that can
- * be potentially deep already. So call softirq in its own stack
- * to prevent from any overrun.
- */
- do_softirq_own_stack();
-#endif
- } else {
- wakeup_softirqd();
- }
-}
-
static inline void tick_irq_exit(void)
{
#ifdef CONFIG_NO_HZ_COMMON
int cpu = smp_processor_id();
/* Make sure that timer wheel updates are propagated */
- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
- if (!in_irq())
+ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if (!in_hardirq())
tick_nohz_irq_exit();
}
#endif
@@ -411,7 +626,7 @@ static inline void __irq_exit_rcu(void)
#else
lockdep_assert_irqs_disabled();
#endif
- account_irq_exit_time(current);
+ account_hardirq_exit(current);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -439,7 +654,7 @@ void irq_exit_rcu(void)
void irq_exit(void)
{
__irq_exit_rcu();
- rcu_irq_exit();
+ ct_irq_exit();
/* must be last! */
lockdep_hardirq_exit();
}
@@ -460,7 +675,7 @@ inline void raise_softirq_irqoff(unsigned int nr)
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
- if (!in_interrupt())
+ if (!in_interrupt() && should_wake_ksoftirqd())
wakeup_softirqd();
}
@@ -475,6 +690,7 @@ void raise_softirq(unsigned int nr)
void __raise_softirq_irqoff(unsigned int nr)
{
+ lockdep_assert_irqs_disabled();
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
@@ -525,6 +741,20 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
+static bool tasklet_clear_sched(struct tasklet_struct *t)
+{
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) {
+ wake_up_var(&t->state);
+ return true;
+ }
+
+ WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n",
+ t->use_callback ? "callback" : "func",
+ t->use_callback ? (void *)t->callback : (void *)t->func);
+
+ return false;
+}
+
static void tasklet_action_common(struct softirq_action *a,
struct tasklet_head *tl_head,
unsigned int softirq_nr)
@@ -544,10 +774,17 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- t->func(t->data);
+ if (tasklet_clear_sched(t)) {
+ if (t->use_callback) {
+ trace_tasklet_entry(t, t->callback);
+ t->callback(t);
+ trace_tasklet_exit(t, t->callback);
+ } else {
+ trace_tasklet_entry(t, t->func);
+ t->func(t->data);
+ trace_tasklet_exit(t, t->func);
+ }
+ }
tasklet_unlock(t);
continue;
}
@@ -573,6 +810,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
+void tasklet_setup(struct tasklet_struct *t,
+ void (*callback)(struct tasklet_struct *))
+{
+ t->next = NULL;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+ t->callback = callback;
+ t->use_callback = true;
+ t->data = 0;
+}
+EXPORT_SYMBOL(tasklet_setup);
+
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
@@ -580,25 +829,67 @@ void tasklet_init(struct tasklet_struct *t,
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
+ t->use_callback = false;
t->data = data;
}
EXPORT_SYMBOL(tasklet_init);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * Prevent a live lock when current preempted soft
+ * interrupt processing or prevents ksoftirqd from
+ * running. If the tasklet runs on a different CPU
+ * then this has no effect other than doing the BH
+ * disable/enable dance for nothing.
+ */
+ local_bh_disable();
+ local_bh_enable();
+ } else {
+ cpu_relax();
+ }
+ }
+}
+EXPORT_SYMBOL(tasklet_unlock_spin_wait);
+#endif
+
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
pr_notice("Attempt to kill tasklet from interrupt\n");
- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- do {
- yield();
- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
- }
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state));
+
tasklet_unlock_wait(t);
- clear_bit(TASKLET_STATE_SCHED, &t->state);
+ tasklet_clear_sched(t);
}
EXPORT_SYMBOL(tasklet_kill);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+void tasklet_unlock(struct tasklet_struct *t)
+{
+ smp_mb__before_atomic();
+ clear_bit(TASKLET_STATE_RUN, &t->state);
+ smp_mb__after_atomic();
+ wake_up_var(&t->state);
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock);
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock_wait);
+#endif
+
void __init softirq_init(void)
{
int cpu;
@@ -621,53 +912,21 @@ static int ksoftirqd_should_run(unsigned int cpu)
static void run_ksoftirqd(unsigned int cpu)
{
- local_irq_disable();
+ ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
- local_irq_enable();
+ ksoftirqd_run_end();
cond_resched();
return;
}
- local_irq_enable();
+ ksoftirqd_run_end();
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
- struct tasklet_struct **i;
-
- BUG_ON(cpu_online(cpu));
- BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
- if (!test_bit(TASKLET_STATE_SCHED, &t->state))
- return;
-
- /* CPU is dead, so no lock needed. */
- for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
- if (*i == t) {
- *i = t->next;
- /* If this was the tail element, move the tail ptr */
- if (*i == NULL)
- per_cpu(tasklet_vec, cpu).tail = i;
- return;
- }
- }
- BUG();
-}
-
static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index b193a59fc05b..34c9d81eea94 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -16,10 +16,12 @@
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
#include <linux/jump_label.h>
#include <linux/sysctl.h>
+#include <linux/init.h>
static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
-int stack_erasing_sysctl(struct ctl_table *table, int write,
+#ifdef CONFIG_SYSCTL
+static int stack_erasing_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
@@ -42,81 +44,121 @@ int stack_erasing_sysctl(struct ctl_table *table, int write,
state ? "enabled" : "disabled");
return ret;
}
+static struct ctl_table stackleak_sysctls[] = {
+ {
+ .procname = "stack_erasing",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = stack_erasing_sysctl,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init stackleak_sysctls_init(void)
+{
+ register_sysctl_init("kernel", stackleak_sysctls);
+ return 0;
+}
+late_initcall(stackleak_sysctls_init);
+#endif /* CONFIG_SYSCTL */
#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
#else
#define skip_erasing() false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-asmlinkage void notrace stackleak_erase(void)
+#ifndef __stackleak_poison
+static __always_inline void __stackleak_poison(unsigned long erase_low,
+ unsigned long erase_high,
+ unsigned long poison)
{
- /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
- unsigned long kstack_ptr = current->lowest_stack;
- unsigned long boundary = (unsigned long)end_of_stack(current);
- unsigned int poison_count = 0;
- const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
-
- if (skip_erasing())
- return;
-
- /* Check that 'lowest_stack' value is sane */
- if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
- kstack_ptr = boundary;
-
- /* Search for the poison value in the kernel stack */
- while (kstack_ptr > boundary && poison_count <= depth) {
- if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
- poison_count++;
- else
- poison_count = 0;
-
- kstack_ptr -= sizeof(unsigned long);
+ while (erase_low < erase_high) {
+ *(unsigned long *)erase_low = poison;
+ erase_low += sizeof(unsigned long);
}
+}
+#endif
- /*
- * One 'long int' at the bottom of the thread stack is reserved and
- * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
- */
- if (kstack_ptr == boundary)
- kstack_ptr += sizeof(unsigned long);
+static __always_inline void __stackleak_erase(bool on_task_stack)
+{
+ const unsigned long task_stack_low = stackleak_task_low_bound(current);
+ const unsigned long task_stack_high = stackleak_task_high_bound(current);
+ unsigned long erase_low, erase_high;
+
+ erase_low = stackleak_find_top_of_poison(task_stack_low,
+ current->lowest_stack);
#ifdef CONFIG_STACKLEAK_METRICS
- current->prev_lowest_stack = kstack_ptr;
+ current->prev_lowest_stack = erase_low;
#endif
/*
- * Now write the poison value to the kernel stack. Start from
- * 'kstack_ptr' and move up till the new 'boundary'. We assume that
- * the stack pointer doesn't change when we write poison.
+ * Write poison to the task's stack between 'erase_low' and
+ * 'erase_high'.
+ *
+ * If we're running on a different stack (e.g. an entry trampoline
+ * stack) we can erase everything below the pt_regs at the top of the
+ * task stack.
+ *
+ * If we're running on the task stack itself, we must not clobber any
+ * stack used by this function and its caller. We assume that this
+ * function has a fixed-size stack frame, and the current stack pointer
+ * doesn't change while we write poison.
*/
- if (on_thread_stack())
- boundary = current_stack_pointer;
+ if (on_task_stack)
+ erase_high = current_stack_pointer;
else
- boundary = current_top_of_stack();
+ erase_high = task_stack_high;
- while (kstack_ptr < boundary) {
- *(unsigned long *)kstack_ptr = STACKLEAK_POISON;
- kstack_ptr += sizeof(unsigned long);
- }
+ __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON);
/* Reset the 'lowest_stack' value for the next syscall */
- current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
+ current->lowest_stack = task_stack_high;
}
-NOKPROBE_SYMBOL(stackleak_erase);
-void __used notrace stackleak_track_stack(void)
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
{
- /*
- * N.B. stackleak_erase() fills the kernel stack with the poison value,
- * which has the register width. That code assumes that the value
- * of 'lowest_stack' is aligned on the register width boundary.
- *
- * That is true for x86 and x86_64 because of the kernel stack
- * alignment on these platforms (for details, see 'cc_stack_align' in
- * arch/x86/Makefile). Take care of that when you port STACKLEAK to
- * new platforms.
- */
- unsigned long sp = (unsigned long)&sp;
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(false);
+}
+
+void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
+{
+ unsigned long sp = current_stack_pointer;
/*
* Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
@@ -125,9 +167,10 @@ void __used notrace stackleak_track_stack(void)
*/
BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+ /* 'lowest_stack' should be aligned on the register width boundary */
+ sp = ALIGN(sp, sizeof(unsigned long));
if (sp < current->lowest_stack &&
- sp >= (unsigned long)task_stack_page(current) +
- sizeof(unsigned long)) {
+ sp >= stackleak_task_low_bound(current)) {
current->lowest_stack = sp;
}
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..afb3c116da91 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
+#include <linux/interrupt.h>
/**
* stack_trace_print - Print the entries in the stack trace
@@ -78,8 +79,7 @@ struct stacktrace_cookie {
unsigned int len;
};
-static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
- bool reliable)
+static bool stack_trace_consume_entry(void *cookie, unsigned long addr)
{
struct stacktrace_cookie *c = cookie;
@@ -94,12 +94,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
return c->len < c->size;
}
-static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
- bool reliable)
+static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr)
{
if (in_sched_functions(addr))
return true;
- return stack_trace_consume_entry(cookie, addr, reliable);
+ return stack_trace_consume_entry(cookie, addr);
}
/**
@@ -127,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save);
/**
* stack_trace_save_tsk - Save a task stack trace into a storage array
- * @task: The task to examine
+ * @tsk: The task to examine
* @store: Pointer to storage array
* @size: Size of the storage array
* @skipnr: Number of entries to skip at the start of the stack trace
@@ -152,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
put_task_stack(tsk);
return c.len;
}
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
/**
* stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
@@ -227,16 +227,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
- mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
- fs = get_fs();
- set_fs(USER_DS);
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- set_fs(fs);
return c.len;
}
@@ -306,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
save_stack_trace_tsk(task, &trace);
return trace.nr_entries;
}
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
/**
* stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
@@ -376,3 +373,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries: Pointer to stack trace array
+ * @nr_entries: Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/kernel/static_call.c b/kernel/static_call.c
new file mode 100644
index 000000000000..e9c3e69f3837
--- /dev/null
+++ b/kernel/static_call.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+
+long __static_call_return0(void)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__static_call_return0);
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
new file mode 100644
index 000000000000..639397b5491c
--- /dev/null
+++ b/kernel/static_call_inline.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/static_call.h>
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/processor.h>
+#include <asm/sections.h>
+
+extern struct static_call_site __start_static_call_sites[],
+ __stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+ __stop_static_call_tramp_key[];
+
+static int static_call_initialized;
+
+/*
+ * Must be called before early_initcall() to be effective.
+ */
+void static_call_force_reinit(void)
+{
+ if (WARN_ON_ONCE(!static_call_initialized))
+ return;
+
+ static_call_initialized++;
+}
+
+/* mutex to protect key modules/sites */
+static DEFINE_MUTEX(static_call_mutex);
+
+static void static_call_lock(void)
+{
+ mutex_lock(&static_call_mutex);
+}
+
+static void static_call_unlock(void)
+{
+ mutex_unlock(&static_call_mutex);
+}
+
+static inline void *static_call_addr(struct static_call_site *site)
+{
+ return (void *)((long)site->addr + (long)&site->addr);
+}
+
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
+
+static inline struct static_call_key *static_call_key(const struct static_call_site *site)
+{
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
+}
+
+/* These assume the key is word-aligned. */
+static inline bool static_call_is_init(struct static_call_site *site)
+{
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
+}
+
+static inline bool static_call_is_tail(struct static_call_site *site)
+{
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
+}
+
+static inline void static_call_set_init(struct static_call_site *site)
+{
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ (long)&site->key;
+}
+
+static int static_call_site_cmp(const void *_a, const void *_b)
+{
+ const struct static_call_site *a = _a;
+ const struct static_call_site *b = _b;
+ const struct static_call_key *key_a = static_call_key(a);
+ const struct static_call_key *key_b = static_call_key(b);
+
+ if (key_a < key_b)
+ return -1;
+
+ if (key_a > key_b)
+ return 1;
+
+ return 0;
+}
+
+static void static_call_site_swap(void *_a, void *_b, int size)
+{
+ long delta = (unsigned long)_a - (unsigned long)_b;
+ struct static_call_site *a = _a;
+ struct static_call_site *b = _b;
+ struct static_call_site tmp = *a;
+
+ a->addr = b->addr - delta;
+ a->key = b->key - delta;
+
+ b->addr = tmp.addr + delta;
+ b->key = tmp.key + delta;
+}
+
+static inline void static_call_sort_entries(struct static_call_site *start,
+ struct static_call_site *stop)
+{
+ sort(start, stop - start, sizeof(struct static_call_site),
+ static_call_site_cmp, static_call_site_swap);
+}
+
+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+ return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+ if (!static_call_key_has_mods(key))
+ return NULL;
+
+ return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+ if (static_call_key_has_mods(key))
+ return NULL;
+
+ return (struct static_call_site *)(key->type & ~1);
+}
+
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+ struct static_call_site *site, *stop;
+ struct static_call_mod *site_mod, first;
+
+ cpus_read_lock();
+ static_call_lock();
+
+ if (key->func == func)
+ goto done;
+
+ key->func = func;
+
+ arch_static_call_transform(NULL, tramp, func, false);
+
+ /*
+ * If uninitialized, we'll not update the callsites, but they still
+ * point to the trampoline and we just patched that.
+ */
+ if (WARN_ON_ONCE(!static_call_initialized))
+ goto done;
+
+ first = (struct static_call_mod){
+ .next = static_call_key_next(key),
+ .mod = NULL,
+ .sites = static_call_key_sites(key),
+ };
+
+ for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
+ struct module *mod = site_mod->mod;
+
+ if (!site_mod->sites) {
+ /*
+ * This can happen if the static call key is defined in
+ * a module which doesn't use it.
+ *
+ * It also happens in the has_mods case, where the
+ * 'first' entry has no sites associated with it.
+ */
+ continue;
+ }
+
+ stop = __stop_static_call_sites;
+
+ if (mod) {
+#ifdef CONFIG_MODULES
+ stop = mod->static_call_sites +
+ mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
+#endif
+ }
+
+ for (site = site_mod->sites;
+ site < stop && static_call_key(site) == key; site++) {
+ void *site_addr = static_call_addr(site);
+
+ if (!init && static_call_is_init(site))
+ continue;
+
+ if (!kernel_text_address((unsigned long)site_addr)) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
+ site_addr);
+ continue;
+ }
+
+ arch_static_call_transform(site_addr, NULL, func,
+ static_call_is_tail(site));
+ }
+ }
+
+done:
+ static_call_unlock();
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__static_call_update);
+
+static int __static_call_init(struct module *mod,
+ struct static_call_site *start,
+ struct static_call_site *stop)
+{
+ struct static_call_site *site;
+ struct static_call_key *key, *prev_key = NULL;
+ struct static_call_mod *site_mod;
+
+ if (start == stop)
+ return 0;
+
+ static_call_sort_entries(start, stop);
+
+ for (site = start; site < stop; site++) {
+ void *site_addr = static_call_addr(site);
+
+ if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
+ (!mod && init_section_contains(site_addr, 1)))
+ static_call_set_init(site);
+
+ key = static_call_key(site);
+ if (key != prev_key) {
+ prev_key = key;
+
+ /*
+ * For vmlinux (!mod) avoid the allocation by storing
+ * the sites pointer in the key itself. Also see
+ * __static_call_update()'s @first.
+ *
+ * This allows architectures (eg. x86) to call
+ * static_call_init() before memory allocation works.
+ */
+ if (!mod) {
+ key->sites = site;
+ key->type |= 1;
+ goto do_transform;
+ }
+
+ site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+ if (!site_mod)
+ return -ENOMEM;
+
+ /*
+ * When the key has a direct sites pointer, extract
+ * that into an explicit struct static_call_mod, so we
+ * can have a list of modules.
+ */
+ if (static_call_key_sites(key)) {
+ site_mod->mod = NULL;
+ site_mod->next = NULL;
+ site_mod->sites = static_call_key_sites(key);
+
+ key->mods = site_mod;
+
+ site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+ if (!site_mod)
+ return -ENOMEM;
+ }
+
+ site_mod->mod = mod;
+ site_mod->sites = site;
+ site_mod->next = static_call_key_next(key);
+ key->mods = site_mod;
+ }
+
+do_transform:
+ arch_static_call_transform(site_addr, NULL, key->func,
+ static_call_is_tail(site));
+ }
+
+ return 0;
+}
+
+static int addr_conflict(struct static_call_site *site, void *start, void *end)
+{
+ unsigned long addr = (unsigned long)static_call_addr(site);
+
+ if (addr <= (unsigned long)end &&
+ addr + CALL_INSN_SIZE > (unsigned long)start)
+ return 1;
+
+ return 0;
+}
+
+static int __static_call_text_reserved(struct static_call_site *iter_start,
+ struct static_call_site *iter_stop,
+ void *start, void *end, bool init)
+{
+ struct static_call_site *iter = iter_start;
+
+ while (iter < iter_stop) {
+ if (init || !static_call_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
+ iter++;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int __static_call_mod_text_reserved(void *start, void *end)
+{
+ struct module *mod;
+ int ret;
+
+ preempt_disable();
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ preempt_enable();
+
+ if (!mod)
+ return 0;
+
+ ret = __static_call_text_reserved(mod->static_call_sites,
+ mod->static_call_sites + mod->num_static_call_sites,
+ start, end, mod->state == MODULE_STATE_COMING);
+
+ module_put(mod);
+
+ return ret;
+}
+
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+ struct static_call_tramp_key *start = __start_static_call_tramp_key;
+ struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+ struct static_call_tramp_key *tramp_key;
+
+ for (tramp_key = start; tramp_key != stop; tramp_key++) {
+ unsigned long tramp;
+
+ tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+ if (tramp == addr)
+ return (long)tramp_key->key + (long)&tramp_key->key;
+ }
+
+ return 0;
+}
+
+static int static_call_add_module(struct module *mod)
+{
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = start + mod->num_static_call_sites;
+ struct static_call_site *site;
+
+ for (site = start; site != stop; site++) {
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
+ unsigned long key;
+
+ /*
+ * Is the key is exported, 'addr' points to the key, which
+ * means modules are allowed to call static_call_update() on
+ * it.
+ *
+ * Otherwise, the key isn't exported, and 'addr' points to the
+ * trampoline so we need to lookup the key.
+ *
+ * We go through this dance to prevent crazy modules from
+ * abusing sensitive static calls.
+ */
+ if (!kernel_text_address(addr))
+ continue;
+
+ key = tramp_key_lookup(addr);
+ if (!key) {
+ pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+ static_call_addr(site));
+ return -EINVAL;
+ }
+
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
+ }
+
+ return __static_call_init(mod, start, stop);
+}
+
+static void static_call_del_module(struct module *mod)
+{
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = mod->static_call_sites +
+ mod->num_static_call_sites;
+ struct static_call_key *key, *prev_key = NULL;
+ struct static_call_mod *site_mod, **prev;
+ struct static_call_site *site;
+
+ for (site = start; site < stop; site++) {
+ key = static_call_key(site);
+ if (key == prev_key)
+ continue;
+
+ prev_key = key;
+
+ for (prev = &key->mods, site_mod = key->mods;
+ site_mod && site_mod->mod != mod;
+ prev = &site_mod->next, site_mod = site_mod->next)
+ ;
+
+ if (!site_mod)
+ continue;
+
+ *prev = site_mod->next;
+ kfree(site_mod);
+ }
+}
+
+static int static_call_module_notify(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ cpus_read_lock();
+ static_call_lock();
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = static_call_add_module(mod);
+ if (ret) {
+ WARN(1, "Failed to allocate memory for static calls");
+ static_call_del_module(mod);
+ }
+ break;
+ case MODULE_STATE_GOING:
+ static_call_del_module(mod);
+ break;
+ }
+
+ static_call_unlock();
+ cpus_read_unlock();
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block static_call_module_nb = {
+ .notifier_call = static_call_module_notify,
+};
+
+#else
+
+static inline int __static_call_mod_text_reserved(void *start, void *end)
+{
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+int static_call_text_reserved(void *start, void *end)
+{
+ bool init = system_state < SYSTEM_RUNNING;
+ int ret = __static_call_text_reserved(__start_static_call_sites,
+ __stop_static_call_sites, start, end, init);
+
+ if (ret)
+ return ret;
+
+ return __static_call_mod_text_reserved(start, end);
+}
+
+int __init static_call_init(void)
+{
+ int ret;
+
+ /* See static_call_force_reinit(). */
+ if (static_call_initialized == 1)
+ return 0;
+
+ cpus_read_lock();
+ static_call_lock();
+ ret = __static_call_init(NULL, __start_static_call_sites,
+ __stop_static_call_sites);
+ static_call_unlock();
+ cpus_read_unlock();
+
+ if (ret) {
+ pr_err("Failed to allocate memory for static_call!\n");
+ BUG();
+ }
+
+#ifdef CONFIG_MODULES
+ if (!static_call_initialized)
+ register_module_notifier(&static_call_module_nb);
+#endif
+
+ static_call_initialized = 1;
+ return 0;
+}
+early_initcall(static_call_init);
+
+#ifdef CONFIG_STATIC_CALL_SELFTEST
+
+static int func_a(int x)
+{
+ return x+1;
+}
+
+static int func_b(int x)
+{
+ return x+2;
+}
+
+DEFINE_STATIC_CALL(sc_selftest, func_a);
+
+static struct static_call_data {
+ int (*func)(int);
+ int val;
+ int expect;
+} static_call_data [] __initdata = {
+ { NULL, 2, 3 },
+ { func_b, 2, 4 },
+ { func_a, 2, 3 }
+};
+
+static int __init test_static_call_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) {
+ struct static_call_data *scd = &static_call_data[i];
+
+ if (scd->func)
+ static_call_update(sc_selftest, scd->func);
+
+ WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect);
+ }
+
+ return 0;
+}
+early_initcall(test_static_call_init);
+
+#endif /* CONFIG_STATIC_CALL_SELFTEST */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 865bb0228ab6..cedb17ba158a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,27 @@ struct cpu_stopper {
struct list_head works; /* list of pending works */
struct cpu_stop_work stop_work; /* for stop_cpus */
+ unsigned long caller;
+ cpu_stop_fn_t fn;
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+ /*
+ * If @task is a stopper task, it cannot migrate and task_cpu() is
+ * stable.
+ */
+ struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));
+
+ if (task != stopper->thread)
+ return;
+
+ printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;
@@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_done done;
- struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
@@ -178,7 +194,7 @@ static void ack_state(struct multi_stop_data *msdata)
set_state(msdata, msdata->state + 1);
}
-void __weak stop_machine_yield(const struct cpumask *cpumask)
+notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
{
cpu_relax();
}
@@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
work1 = work2 = (struct cpu_stop_work){
.fn = multi_cpu_stop,
.arg = &msdata,
- .done = &done
+ .done = &done,
+ .caller = _RET_IP_,
};
cpu_stop_init_done(&done, 2);
@@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
- *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
return cpu_stop_queue_work(cpu, work_buf);
}
@@ -392,6 +409,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
+ work->caller = _RET_IP_;
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
@@ -487,6 +505,8 @@ repeat:
int ret;
/* cpu stop callbacks must not sleep, make in_atomic() == T */
+ stopper->caller = work->caller;
+ stopper->fn = fn;
preempt_count_inc();
ret = fn(arg);
if (done) {
@@ -495,6 +515,8 @@ repeat:
cpu_stop_signal_done(done);
}
preempt_count_dec();
+ stopper->fn = NULL;
+ stopper->caller = 0;
WARN_ONCE(preempt_count(),
"cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
goto repeat;
@@ -513,8 +535,6 @@ void stop_machine_park(int cpu)
kthread_park(stopper->thread);
}
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
static void cpu_stop_create(unsigned int cpu)
{
sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
@@ -611,6 +631,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
}
EXPORT_SYMBOL_GPL(stop_machine);
+#ifdef CONFIG_SCHED_SMT
+int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ struct multi_stop_data msdata = {
+ .fn = fn,
+ .data = data,
+ .num_threads = cpumask_weight(smt_mask),
+ .active_cpus = smt_mask,
+ };
+
+ lockdep_assert_cpus_held();
+
+ /* Set the initial state and stop all online cpus. */
+ set_state(&msdata, MULTI_STOP_PREPARE);
+ return stop_cpus(smt_mask, multi_cpu_stop, &msdata);
+}
+EXPORT_SYMBOL_GPL(stop_core_cpuslocked);
+#endif
+
/**
* stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
* @fn: the function to run
diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
@@ -14,6 +15,7 @@
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
+#include <linux/ksm.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
@@ -24,7 +26,7 @@
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
-#include <linux/dcookies.h>
+#include <linux/random.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
@@ -42,6 +44,7 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -116,15 +119,33 @@
#ifndef SVE_GET_VL
# define SVE_GET_VL() (-EINVAL)
#endif
+#ifndef SME_SET_VL
+# define SME_SET_VL(a) (-EINVAL)
+#endif
+#ifndef SME_GET_VL
+# define SME_GET_VL() (-EINVAL)
+#endif
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef PAC_SET_ENABLED_KEYS
+# define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL)
+#endif
+#ifndef PAC_GET_ENABLED_KEYS
+# define PAC_GET_ENABLED_KEYS(a) (-EINVAL)
+#endif
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
#endif
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
#endif
+#ifndef RISCV_V_SET_CONTROL
+# define RISCV_V_SET_CONTROL(a) (-EINVAL)
+#endif
+#ifndef RISCV_V_GET_CONTROL
+# define RISCV_V_GET_CONTROL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -214,7 +235,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
niceval = MAX_NICE;
rcu_read_lock();
- read_lock(&tasklist_lock);
switch (which) {
case PRIO_PROCESS:
if (who)
@@ -229,9 +249,11 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
+ read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
error = set_one_prio(p, niceval, error);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ read_unlock(&tasklist_lock);
break;
case PRIO_USER:
uid = make_kuid(cred->user_ns, who);
@@ -243,16 +265,15 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
if (!user)
goto out_unlock; /* No processes for this user */
}
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
+ }
if (!uid_eq(uid, cred->uid))
free_uid(user); /* For find_user() */
break;
}
out_unlock:
- read_unlock(&tasklist_lock);
rcu_read_unlock();
out:
return error;
@@ -277,7 +298,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
return -EINVAL;
rcu_read_lock();
- read_lock(&tasklist_lock);
switch (which) {
case PRIO_PROCESS:
if (who)
@@ -295,11 +315,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
+ read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ read_unlock(&tasklist_lock);
break;
case PRIO_USER:
uid = make_kuid(cred->user_ns, who);
@@ -311,19 +333,18 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
if (!user)
goto out_unlock; /* No processes for this user */
}
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- } while_each_thread(g, p);
+ }
if (!uid_eq(uid, cred->uid))
free_uid(user); /* for find_user() */
break;
}
out_unlock:
- read_unlock(&tasklist_lock);
rcu_read_unlock();
return retval;
@@ -373,7 +394,7 @@ long __sys_setregid(gid_t rgid, gid_t egid)
if (rgid != (gid_t) -1) {
if (gid_eq(old->gid, krgid) ||
gid_eq(old->egid, krgid) ||
- ns_capable(old->user_ns, CAP_SETGID))
+ ns_capable_setid(old->user_ns, CAP_SETGID))
new->gid = krgid;
else
goto error;
@@ -382,7 +403,7 @@ long __sys_setregid(gid_t rgid, gid_t egid)
if (gid_eq(old->gid, kegid) ||
gid_eq(old->egid, kegid) ||
gid_eq(old->sgid, kegid) ||
- ns_capable(old->user_ns, CAP_SETGID))
+ ns_capable_setid(old->user_ns, CAP_SETGID))
new->egid = kegid;
else
goto error;
@@ -432,7 +453,7 @@ long __sys_setgid(gid_t gid)
old = current_cred();
retval = -EPERM;
- if (ns_capable(old->user_ns, CAP_SETGID))
+ if (ns_capable_setid(old->user_ns, CAP_SETGID))
new->gid = new->egid = new->sgid = new->fsgid = kgid;
else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
new->egid = new->fsgid = kgid;
@@ -466,6 +487,16 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;
+ free_uid(new->user);
+ new->user = new_user;
+ return 0;
+}
+
+static void flag_nproc_exceeded(struct cred *new)
+{
+ if (new->ucounts == current_ucounts())
+ return;
+
/*
* We don't fail in case of NPROC limit excess here because too many
* poorly written programs don't check set*uid() return code, assuming
@@ -473,15 +504,11 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
- new_user != INIT_USER)
+ if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
+ new->user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
-
- free_uid(new->user);
- new->user = new_user;
- return 0;
}
/*
@@ -552,6 +579,11 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -610,6 +642,11 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -634,6 +671,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
+ bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -648,25 +686,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+ (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+ uid_eq(keuid, old->fsuid))) &&
+ (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+ return 0;
+
+ ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+ euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+ suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+ if ((ruid_new || euid_new || suid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
- if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
- !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
- goto error;
- if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
- !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
- goto error;
- if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
- !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
- goto error;
- }
-
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
@@ -685,6 +727,11 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -726,6 +773,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
+ bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
@@ -738,23 +786,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+ (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+ gid_eq(kegid, old->fsgid))) &&
+ (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+ return 0;
+
+ rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+ egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+ sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+ if ((rgid_new || egid_new || sgid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETGID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable(old->user_ns, CAP_SETGID)) {
- if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
- !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
- goto error;
- if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
- !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
- goto error;
- if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
- !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
- goto error;
- }
if (rgid != (gid_t) -1)
new->gid = krgid;
@@ -871,7 +924,7 @@ long __sys_setfsgid(gid_t gid)
if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
- ns_capable(old->user_ns, CAP_SETGID)) {
+ ns_capable_setid(old->user_ns, CAP_SETGID)) {
if (!gid_eq(kgid, old->fsgid)) {
new->fsgid = kgid;
if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0)
@@ -1242,7 +1295,7 @@ static int override_release(char __user *release, size_t len)
break;
rest++;
}
- v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+ v = LINUX_VERSION_PATCHLEVEL + 60;
copy = clamp_t(size_t, len, 1, sizeof(buf));
copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
ret = copy_to_user(release, buf, copy + 1);
@@ -1332,6 +1385,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
if (!copy_from_user(tmp, name, len)) {
struct new_utsname *u;
+ add_device_randomness(tmp, len);
down_write(&uts_sem);
u = utsname();
memcpy(u->nodename, tmp, len);
@@ -1385,6 +1439,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
if (!copy_from_user(tmp, name, len)) {
struct new_utsname *u;
+ add_device_randomness(tmp, len);
down_write(&uts_sem);
u = utsname();
memcpy(u->domainname, tmp, len);
@@ -1396,6 +1451,70 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
return errno;
}
+/* make sure you are allowed to change @tsk limits before calling this */
+static int do_prlimit(struct task_struct *tsk, unsigned int resource,
+ struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+ struct rlimit *rlim;
+ int retval = 0;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
+
+ if (new_rlim) {
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
+ return -EINVAL;
+ if (resource == RLIMIT_NOFILE &&
+ new_rlim->rlim_max > sysctl_nr_open)
+ return -EPERM;
+ }
+
+ /* Holding a refcount on tsk protects tsk->signal from disappearing. */
+ rlim = tsk->signal->rlim + resource;
+ task_lock(tsk->group_leader);
+ if (new_rlim) {
+ /*
+ * Keep the capable check against init_user_ns until cgroups can
+ * contain all limits.
+ */
+ if (new_rlim->rlim_max > rlim->rlim_max &&
+ !capable(CAP_SYS_RESOURCE))
+ retval = -EPERM;
+ if (!retval)
+ retval = security_task_setrlimit(tsk, resource, new_rlim);
+ }
+ if (!retval) {
+ if (old_rlim)
+ *old_rlim = *rlim;
+ if (new_rlim)
+ *rlim = *new_rlim;
+ }
+ task_unlock(tsk->group_leader);
+
+ /*
+ * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+ * infinite. In case of RLIM_INFINITY the posix CPU timer code
+ * ignores the rlimit.
+ */
+ if (!retval && new_rlim && resource == RLIMIT_CPU &&
+ new_rlim->rlim_cur != RLIM_INFINITY &&
+ IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+ /*
+ * update_rlimit_cpu can fail if the task is exiting, but there
+ * may be other tasks in the thread group that are not exiting,
+ * and they need their cpu timers adjusted.
+ *
+ * The group_leader is the last task to be released, so if we
+ * cannot update_rlimit_cpu on it, then the entire process is
+ * exiting and we do not need to update at all.
+ */
+ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
+ }
+
+ return retval;
+}
+
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
struct rlimit value;
@@ -1539,63 +1658,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
rlim->rlim_max = (unsigned long)rlim64->rlim_max;
}
-/* make sure you are allowed to change @tsk limits before calling this */
-int do_prlimit(struct task_struct *tsk, unsigned int resource,
- struct rlimit *new_rlim, struct rlimit *old_rlim)
-{
- struct rlimit *rlim;
- int retval = 0;
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
- if (new_rlim) {
- if (new_rlim->rlim_cur > new_rlim->rlim_max)
- return -EINVAL;
- if (resource == RLIMIT_NOFILE &&
- new_rlim->rlim_max > sysctl_nr_open)
- return -EPERM;
- }
-
- /* protect tsk->signal and tsk->sighand from disappearing */
- read_lock(&tasklist_lock);
- if (!tsk->sighand) {
- retval = -ESRCH;
- goto out;
- }
-
- rlim = tsk->signal->rlim + resource;
- task_lock(tsk->group_leader);
- if (new_rlim) {
- /* Keep the capable check against init_user_ns until
- cgroups can contain all limits */
- if (new_rlim->rlim_max > rlim->rlim_max &&
- !capable(CAP_SYS_RESOURCE))
- retval = -EPERM;
- if (!retval)
- retval = security_task_setrlimit(tsk, resource, new_rlim);
- }
- if (!retval) {
- if (old_rlim)
- *old_rlim = *rlim;
- if (new_rlim)
- *rlim = *new_rlim;
- }
- task_unlock(tsk->group_leader);
-
- /*
- * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
- * infite. In case of RLIM_INFINITY the posix CPU timer code
- * ignores the rlimit.
- */
- if (!retval && new_rlim && resource == RLIMIT_CPU &&
- new_rlim->rlim_cur != RLIM_INFINITY &&
- IS_ENABLED(CONFIG_POSIX_TIMERS))
- update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
- read_unlock(&tasklist_lock);
- return retval;
-}
-
/* rcu lock must be held */
static int check_prlimit_permission(struct task_struct *task,
unsigned int flags)
@@ -1723,74 +1785,87 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
+ struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
- memset((char *)r, 0, sizeof (*r));
+retry:
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
- maxrss = p->signal->maxrss;
- goto out;
+ maxrss = sig->maxrss;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
+ utime = sig->cutime;
+ stime = sig->cstime;
+ r->ru_nvcsw = sig->cnvcsw;
+ r->ru_nivcsw = sig->cnivcsw;
+ r->ru_minflt = sig->cmin_flt;
+ r->ru_majflt = sig->cmaj_flt;
+ r->ru_inblock = sig->cinblock;
+ r->ru_oublock = sig->coublock;
+ maxrss = sig->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
- /* fall through */
+ fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
+ r->ru_nvcsw += sig->nvcsw;
+ r->ru_nivcsw += sig->nivcsw;
+ r->ru_minflt += sig->min_flt;
+ r->ru_majflt += sig->maj_flt;
+ r->ru_inblock += sig->inblock;
+ r->ru_oublock += sig->oublock;
+ if (maxrss < sig->maxrss)
+ maxrss = sig->maxrss;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
@@ -1828,7 +1903,6 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
- struct file *old_exe, *exe_file;
struct inode *inode;
int err;
@@ -1847,44 +1921,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
- err = inode_permission(inode, MAY_EXEC);
+ err = file_permission(exe.file, MAY_EXEC);
if (err)
goto exit;
- /*
- * Forbid mm->exe_file change if old file still mapped.
- */
- exe_file = get_mm_exe_file(mm);
- err = -EBUSY;
- if (exe_file) {
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_err;
- }
-
- mmap_read_unlock(mm);
- fput(exe_file);
- }
-
- err = 0;
- /* set the new file, lockless */
- get_file(exe.file);
- old_exe = xchg(&mm->exe_file, exe.file);
- if (old_exe)
- fput(old_exe);
+ err = replace_mm_exe_file(mm, exe.file);
exit:
fdput(exe);
return err;
-exit_err:
- mmap_read_unlock(mm);
- fput(exe_file);
- goto exit;
}
/*
@@ -1942,13 +1986,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
error = -EINVAL;
/*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
* Neither we should allow to override limits if they set.
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
@@ -2007,12 +2044,15 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.exe_fd != (u32)-1) {
/*
- * Make sure the caller has the rights to
- * change /proc/pid/exe link: only local sys admin should
- * be allowed to.
+ * Check if the current user is checkpoint/restore capable.
+ * At the time of this writing, it checks for CAP_SYS_ADMIN
+ * or CAP_CHECKPOINT_RESTORE.
+ * Note that a user with access to ptrace can masquerade an
+ * arbitrary program as any executable, even setuid ones.
+ * This may have implications in the tomoyo subsystem.
*/
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
- return -EINVAL;
+ if (!checkpoint_restore_ns_capable(current_user_ns()))
+ return -EPERM;
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
if (error)
@@ -2020,7 +2060,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
/*
- * arg_lock protects concurent updates but we still need mmap_lock for
+ * arg_lock protects concurrent updates but we still need mmap_lock for
* read to exclude races with sys_brk.
*/
mmap_read_lock(mm);
@@ -2031,8 +2071,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* VMAs already unmapped and kernel uses these members for statistics
* output in procfs mostly, except
*
- * - @start_brk/@brk which are used in do_brk but kernel lookups
- * for VMAs when updating these memvers so anything wrong written
+ * - @start_brk/@brk which are used in do_brk_flags but kernel lookups
+ * for VMAs when updating these members so anything wrong written
* here cause kernel to swear at userspace program but won't lead
* to any problem in kernel itself
*/
@@ -2076,7 +2116,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
- unsigned long user_auxv[AT_VECTOR_SIZE];
+ unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;
@@ -2134,7 +2174,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
/*
- * arg_lock protects concurent updates of arg boundaries, we need
+ * arg_lock protects concurrent updates of arg boundaries, we need
* mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
* validation.
*/
@@ -2201,7 +2241,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
* If command line arguments and environment
* are placed somewhere else on stack, we can
* set them up here, ARG_START/END to setup
- * command line argumets and ENV_START/END
+ * command line arguments and ENV_START/END
* for environment.
*/
case PR_SET_MM_START_STACK:
@@ -2235,12 +2275,12 @@ out:
}
#ifdef CONFIG_CHECKPOINT_RESTORE
-static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
return put_user(me->clear_child_tid, tid_addr);
}
#else
-static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
return -EINVAL;
}
@@ -2249,8 +2289,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
/*
- * If task has has_child_subreaper - all its decendants
- * already have these flag too and new decendants will
+ * If task has has_child_subreaper - all its descendants
+ * already have these flag too and new descendants will
* inherit it on fork, skip them.
*
* If we've found child_reaper - skip descendants in
@@ -2277,6 +2317,131 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+ unsigned long size, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ const char __user *uname;
+ struct anon_vma_name *anon_name = NULL;
+ int error;
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ uname = (const char __user *)arg;
+ if (uname) {
+ char *name, *pch;
+
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+
+ }
+
+ mmap_write_lock(mm);
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
+ mmap_write_unlock(mm);
+ anon_vma_name_put(anon_name);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ return error;
+}
+
+#else /* CONFIG_ANON_VMA_NAME */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long size, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_ANON_VMA_NAME */
+
+static inline unsigned long get_current_mdwe(void)
+{
+ unsigned long ret = 0;
+
+ if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+ if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ ret |= PR_MDWE_NO_INHERIT;
+
+ return ret;
+}
+
+static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ unsigned long current_bits;
+
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+ return -EINVAL;
+
+ /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ /* PARISC cannot allow mdwe as it needs writable stacks */
+ if (IS_ENABLED(CONFIG_PARISC))
+ return -EINVAL;
+
+ current_bits = get_current_mdwe();
+ if (current_bits && current_bits != bits)
+ return -EPERM; /* Cannot unset the flags */
+
+ if (bits & PR_MDWE_NO_INHERIT)
+ set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
+ if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
+ set_bit(MMF_HAS_MDWE, &current->mm->flags);
+
+ return 0;
+}
+
+static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return get_current_mdwe();
+}
+
+static int prctl_get_auxv(void __user *addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len);
+
+ if (size && copy_to_user(addr, mm->saved_auxv, size))
+ return -EFAULT;
+ return sizeof(mm->saved_auxv);
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2424,7 +2589,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = prctl_set_mm(arg2, arg3, arg4, arg5);
break;
case PR_GET_TID_ADDRESS:
- error = prctl_get_tid_address(me, (int __user **)arg2);
+ error = prctl_get_tid_address(me, (int __user * __user *)arg2);
break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
@@ -2479,6 +2644,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_SME_SET_VL:
+ error = SME_SET_VL(arg2);
+ break;
+ case PR_SME_GET_VL:
+ error = SME_GET_VL();
+ break;
case PR_GET_SPECULATION_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
@@ -2494,6 +2665,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_PAC_SET_ENABLED_KEYS:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
+ break;
+ case PR_PAC_GET_ENABLED_KEYS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = PAC_GET_ENABLED_KEYS(me);
+ break;
case PR_SET_TAGGED_ADDR_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
@@ -2527,6 +2708,55 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_SYSCALL_USER_DISPATCH:
+ error = set_syscall_user_dispatch(arg2, arg3, arg4,
+ (char __user *) arg5);
+ break;
+#ifdef CONFIG_SCHED_CORE
+ case PR_SCHED_CORE:
+ error = sched_core_share_pid(arg2, arg3, arg4, arg5);
+ break;
+#endif
+ case PR_SET_MDWE:
+ error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_MDWE:
+ error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
+ break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_AUXV:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = prctl_get_auxv((void __user *)arg2, arg3);
+ break;
+#ifdef CONFIG_KSM
+ case PR_SET_MEMORY_MERGE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ if (mmap_write_lock_killable(me->mm))
+ return -EINTR;
+
+ if (arg2)
+ error = ksm_enable_merge_any(me->mm);
+ else
+ error = ksm_disable_merge_any(me->mm);
+ mmap_write_unlock(me->mm);
+ break;
+ case PR_GET_MEMORY_MERGE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ break;
+#endif
+ case PR_RISCV_V_SET_CONTROL:
+ error = RISCV_V_SET_CONTROL(arg2);
+ break;
+ case PR_RISCV_V_GET_CONTROL:
+ error = RISCV_V_GET_CONTROL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..faad00cce269 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,125 +51,48 @@ COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter);
COND_SYSCALL(io_uring_register);
-
-/* fs/xattr.c */
-
-/* fs/dcache.c */
-
-/* fs/cookies.c */
-COND_SYSCALL(lookup_dcookie);
-COND_SYSCALL_COMPAT(lookup_dcookie);
-
-/* fs/eventfd.c */
COND_SYSCALL(eventfd2);
-
-/* fs/eventfd.c */
COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
-
-/* fs/fcntl.c */
-
-/* fs/inotify_user.c */
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
COND_SYSCALL(inotify_init1);
COND_SYSCALL(inotify_add_watch);
COND_SYSCALL(inotify_rm_watch);
-
-/* fs/ioctl.c */
-
-/* fs/ioprio.c */
COND_SYSCALL(ioprio_set);
COND_SYSCALL(ioprio_get);
-
-/* fs/locks.c */
COND_SYSCALL(flock);
-
-/* fs/namei.c */
-
-/* fs/namespace.c */
-
-/* fs/nfsctl.c */
-
-/* fs/open.c */
-
-/* fs/pipe.c */
-
-/* fs/quota.c */
COND_SYSCALL(quotactl);
-
-/* fs/readdir.c */
-
-/* fs/read_write.c */
-
-/* fs/sendfile.c */
-
-/* fs/select.c */
-
-/* fs/signalfd.c */
+COND_SYSCALL(quotactl_fd);
COND_SYSCALL(signalfd4);
COND_SYSCALL_COMPAT(signalfd4);
-
-/* fs/splice.c */
-
-/* fs/stat.c */
-
-/* fs/sync.c */
-
-/* fs/timerfd.c */
COND_SYSCALL(timerfd_create);
COND_SYSCALL(timerfd_settime);
COND_SYSCALL(timerfd_settime32);
COND_SYSCALL(timerfd_gettime);
COND_SYSCALL(timerfd_gettime32);
-
-/* fs/utimes.c */
-
-/* kernel/acct.c */
COND_SYSCALL(acct);
-
-/* kernel/capability.c */
COND_SYSCALL(capget);
COND_SYSCALL(capset);
-
-/* kernel/exec_domain.c */
-
-/* kernel/exit.c */
-
-/* kernel/fork.c */
/* __ARCH_WANT_SYS_CLONE3 */
COND_SYSCALL(clone3);
-
-/* kernel/futex.c */
COND_SYSCALL(futex);
COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
COND_SYSCALL_COMPAT(get_robust_list);
-
-/* kernel/hrtimer.c */
-
-/* kernel/itimer.c */
-
-/* kernel/kexec.c */
+COND_SYSCALL(futex_waitv);
+COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_requeue);
COND_SYSCALL(kexec_load);
COND_SYSCALL_COMPAT(kexec_load);
-
-/* kernel/module.c */
COND_SYSCALL(init_module);
COND_SYSCALL(delete_module);
-
-/* kernel/posix-timers.c */
-
-/* kernel/printk.c */
COND_SYSCALL(syslog);
-
-/* kernel/ptrace.c */
-
-/* kernel/sched/core.c */
-
-/* kernel/sys.c */
COND_SYSCALL(setregid);
COND_SYSCALL(setgid);
COND_SYSCALL(setreuid);
@@ -182,12 +105,6 @@ COND_SYSCALL(setfsuid);
COND_SYSCALL(setfsgid);
COND_SYSCALL(setgroups);
COND_SYSCALL(getgroups);
-
-/* kernel/time.c */
-
-/* kernel/timer.c */
-
-/* ipc/mqueue.c */
COND_SYSCALL(mq_open);
COND_SYSCALL_COMPAT(mq_open);
COND_SYSCALL(mq_unlink);
@@ -199,8 +116,6 @@ COND_SYSCALL(mq_notify);
COND_SYSCALL_COMPAT(mq_notify);
COND_SYSCALL(mq_getsetattr);
COND_SYSCALL_COMPAT(mq_getsetattr);
-
-/* ipc/msg.c */
COND_SYSCALL(msgget);
COND_SYSCALL(old_msgctl);
COND_SYSCALL(msgctl);
@@ -210,8 +125,6 @@ COND_SYSCALL(msgrcv);
COND_SYSCALL_COMPAT(msgrcv);
COND_SYSCALL(msgsnd);
COND_SYSCALL_COMPAT(msgsnd);
-
-/* ipc/sem.c */
COND_SYSCALL(semget);
COND_SYSCALL(old_semctl);
COND_SYSCALL(semctl);
@@ -220,8 +133,6 @@ COND_SYSCALL_COMPAT(old_semctl);
COND_SYSCALL(semtimedop);
COND_SYSCALL(semtimedop_time32);
COND_SYSCALL(semop);
-
-/* ipc/shm.c */
COND_SYSCALL(shmget);
COND_SYSCALL(old_shmctl);
COND_SYSCALL(shmctl);
@@ -230,8 +141,6 @@ COND_SYSCALL_COMPAT(old_shmctl);
COND_SYSCALL(shmat);
COND_SYSCALL_COMPAT(shmat);
COND_SYSCALL(shmdt);
-
-/* net/socket.c */
COND_SYSCALL(socket);
COND_SYSCALL(socketpair);
COND_SYSCALL(bind);
@@ -252,24 +161,21 @@ COND_SYSCALL(sendmsg);
COND_SYSCALL_COMPAT(sendmsg);
COND_SYSCALL(recvmsg);
COND_SYSCALL_COMPAT(recvmsg);
-
-/* mm/filemap.c */
-
-/* mm/nommu.c, also with MMU */
COND_SYSCALL(mremap);
-
-/* security/keys/keyctl.c */
COND_SYSCALL(add_key);
COND_SYSCALL(request_key);
COND_SYSCALL(keyctl);
COND_SYSCALL_COMPAT(keyctl);
-
-/* arch/example/kernel/sys_example.c */
-
-/* mm/fadvise.c */
+COND_SYSCALL(landlock_create_ruleset);
+COND_SYSCALL(landlock_add_rule);
+COND_SYSCALL(landlock_restrict_self);
COND_SYSCALL(fadvise64_64);
+COND_SYSCALL_COMPAT(fadvise64_64);
+COND_SYSCALL(lsm_get_self_attr);
+COND_SYSCALL(lsm_set_self_attr);
+COND_SYSCALL(lsm_list_modules);
-/* mm/, CONFIG_MMU only */
+/* CONFIG_MMU only */
COND_SYSCALL(swapon);
COND_SYSCALL(swapoff);
COND_SYSCALL(mprotect);
@@ -280,17 +186,16 @@ COND_SYSCALL(mlockall);
COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
+COND_SYSCALL(process_madvise);
+COND_SYSCALL(process_mrelease);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
-COND_SYSCALL_COMPAT(mbind);
COND_SYSCALL(get_mempolicy);
-COND_SYSCALL_COMPAT(get_mempolicy);
COND_SYSCALL(set_mempolicy);
-COND_SYSCALL_COMPAT(set_mempolicy);
COND_SYSCALL(migrate_pages);
-COND_SYSCALL_COMPAT(migrate_pages);
COND_SYSCALL(move_pages);
-COND_SYSCALL_COMPAT(move_pages);
+COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(cachestat);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
@@ -299,6 +204,20 @@ COND_SYSCALL(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time64);
+/* Posix timer syscalls may be configured out */
+COND_SYSCALL(timer_create);
+COND_SYSCALL(timer_gettime);
+COND_SYSCALL(timer_getoverrun);
+COND_SYSCALL(timer_settime);
+COND_SYSCALL(timer_delete);
+COND_SYSCALL(clock_adjtime);
+COND_SYSCALL(getitimer);
+COND_SYSCALL(setitimer);
+COND_SYSCALL(alarm);
+COND_SYSCALL_COMPAT(timer_create);
+COND_SYSCALL_COMPAT(getitimer);
+COND_SYSCALL_COMPAT(setitimer);
+
/*
* Architecture specific syscalls: see further below
*/
@@ -349,6 +268,8 @@ COND_SYSCALL(pkey_mprotect);
COND_SYSCALL(pkey_alloc);
COND_SYSCALL(pkey_free);
+/* memfd_secret */
+COND_SYSCALL(memfd_secret);
/*
* Architecture specific weak syscall entries.
@@ -364,15 +285,14 @@ COND_SYSCALL(socketcall);
COND_SYSCALL_COMPAT(socketcall);
/* compat syscalls for arm64, x86, ... */
-COND_SYSCALL_COMPAT(sysctl);
COND_SYSCALL_COMPAT(fanotify_mark);
/* x86 */
COND_SYSCALL(vm86old);
COND_SYSCALL(modify_ldt);
-COND_SYSCALL_COMPAT(quotactl32);
COND_SYSCALL(vm86);
COND_SYSCALL(kexec_file_load);
+COND_SYSCALL(map_shadow_stack);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
@@ -407,7 +327,6 @@ COND_SYSCALL(epoll_wait);
COND_SYSCALL(recv);
COND_SYSCALL_COMPAT(recv);
COND_SYSCALL(send);
-COND_SYSCALL(bdflush);
COND_SYSCALL(uselib);
/* optional: time32 */
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index ccb78509f1a8..6ef887c19c48 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -9,9 +9,6 @@
#define KUNIT_PROC_READ 0
#define KUNIT_PROC_WRITE 1
-static int i_zero;
-static int i_one_hundred = 100;
-
/*
* Test that proc_dointvec will not try to use a NULL .data field even when the
* length is non-zero.
@@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
/*
* proc_dointvec expects a buffer in user space, so we allocate one. We
@@ -49,7 +46,7 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
KUNIT_PROC_READ, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
/*
* See above.
@@ -58,7 +55,7 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
KUNIT_PROC_WRITE, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
.maxlen = 0,
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -95,7 +92,7 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
KUNIT_PROC_READ, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
/*
* See previous comment.
@@ -104,7 +101,7 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
KUNIT_PROC_WRITE, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -135,11 +132,11 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -174,7 +171,7 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 4;
loff_t pos = 0;
@@ -203,7 +200,7 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
user_buffer, &len, &pos));
- KUNIT_ASSERT_EQ(test, (size_t)3, len);
+ KUNIT_ASSERT_EQ(test, 3, len);
buffer[len] = '\0';
/* And we read 13 back out. */
KUNIT_EXPECT_STREQ(test, "13\n", buffer);
@@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 5;
loff_t pos = 0;
@@ -233,9 +230,9 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
user_buffer, &len, &pos));
- KUNIT_ASSERT_EQ(test, (size_t)4, len);
+ KUNIT_ASSERT_EQ(test, 4, len);
buffer[len] = '\0';
- KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer);
+ KUNIT_EXPECT_STREQ(test, "-16\n", buffer);
}
/*
@@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "9";
size_t len = sizeof(input) - 1;
@@ -265,7 +262,7 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
user_buffer, &len, &pos));
KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
- KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, pos);
KUNIT_EXPECT_EQ(test, 9, *((int *)table.data));
}
@@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "-9";
size_t len = sizeof(input) - 1;
@@ -295,7 +292,7 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
user_buffer, &len, &pos));
KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
- KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, pos);
KUNIT_EXPECT_EQ(test, -9, *((int *)table.data));
}
@@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;
@@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &i_zero,
- .extra2 = &i_one_hundred,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..157f7ce2942d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -20,18 +20,19 @@
*/
#include <linux/module.h>
-#include <linux/aio.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/bitmap.h>
#include <linux/signal.h>
+#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemleak.h>
+#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -41,14 +42,12 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/ratelimit.h>
-#include <linux/compaction.h>
#include <linux/hugetlb.h>
#include <linux/initrd.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
-#include <linux/dnotify.h>
#include <linux/syscalls.h>
#include <linux/vmstat.h>
#include <linux/nfs_fs.h>
@@ -56,20 +55,13 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
-#include <linux/kprobes.h>
-#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
-#include <linux/sched/coredump.h>
-#include <linux/kexec.h>
-#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
-#include <linux/coredump.h>
-#include <linux/latencytop.h>
#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -85,70 +77,29 @@
#ifdef CONFIG_SPARC
#include <asm/setup.h>
#endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-#include <linux/acct.h>
-#endif
#ifdef CONFIG_RT_MUTEXES
#include <linux/rtmutex.h>
#endif
-#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
-#include <linux/lockdep.h>
-#endif
-#ifdef CONFIG_CHR_DEV_SG
-#include <scsi/sg.h>
-#endif
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
-#include <linux/stackleak.h>
-#endif
-#ifdef CONFIG_LOCKUP_DETECTOR
-#include <linux/nmi.h>
-#endif
+
+/* shared constants to be used in various sysctls */
+const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
+EXPORT_SYMBOL(sysctl_vals);
+
+const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
+EXPORT_SYMBOL_GPL(sysctl_long_vals);
#if defined(CONFIG_SYSCTL)
-/* Constants used for minimum and maximum */
-#ifdef CONFIG_LOCKUP_DETECTOR
-static int sixty = 60;
-#endif
+/* Constants used for minimum and maximum */
-static int __maybe_unused neg_one = -1;
-static int __maybe_unused two = 2;
-static int __maybe_unused four = 4;
-static unsigned long zero_ul;
-static unsigned long one_ul = 1;
-static unsigned long long_max = LONG_MAX;
-static int one_hundred = 100;
-static int two_hundred = 200;
-static int one_thousand = 1000;
-#ifdef CONFIG_PRINTK
-static int ten_thousand = 10000;
-#endif
#ifdef CONFIG_PERF_EVENTS
-static int six_hundred_forty_kb = 640 * 1024;
+static const int six_hundred_forty_kb = 640 * 1024;
#endif
-/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-
-/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
-static int maxolduid = 65535;
-static int minolduid;
-static int ngroups_max = NGROUPS_MAX;
+static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
-/*
- * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
- * and hung_task_check_interval_secs
- */
-#ifdef CONFIG_DETECT_HUNG_TASK
-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
-#endif
-
-#ifdef CONFIG_INOTIFY_USER
-#include <linux/inotify.h>
-#endif
-
#ifdef CONFIG_PROC_SYSCTL
/**
@@ -184,58 +135,8 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
-#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
-#endif
-
#endif /* CONFIG_SYSCTL */
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int bpf_stats_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static int saved_val;
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&bpf_stats_enabled_mutex);
- val = saved_val;
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret && val != saved_val) {
- if (val)
- static_key_slow_inc(key);
- else
- static_key_slow_dec(key);
- saved_val = val;
- }
- mutex_unlock(&bpf_stats_enabled_mutex);
- return ret;
-}
-#endif
-
/*
* /proc/sys support
*/
@@ -365,13 +266,14 @@ int proc_dostring(struct ctl_table *table, int write,
ppos);
}
-static size_t proc_skip_spaces(char **buf)
+static void proc_skip_spaces(char **buf, size_t *size)
{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
+ while (*size) {
+ if (!isspace(**buf))
+ break;
+ (*size)--;
+ (*buf)++;
+ }
}
static void proc_skip_char(char **buf, size_t *size, const char v)
@@ -440,13 +342,12 @@ static int proc_get_long(char **buf, size_t *size,
unsigned long *val, bool *neg,
const char *perm_tr, unsigned perm_tr_len, char *tr)
{
- int len;
char *p, tmp[TMPBUFLEN];
+ ssize_t len = *size;
- if (!*size)
+ if (len <= 0)
return -EINVAL;
- len = *size;
if (len > TMPBUFLEN - 1)
len = TMPBUFLEN - 1;
@@ -531,14 +432,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
if (*negp) {
if (*lvalp > (unsigned long) INT_MAX + 1)
return -EINVAL;
- *valp = -*lvalp;
+ WRITE_ONCE(*valp, -*lvalp);
} else {
if (*lvalp > (unsigned long) INT_MAX)
return -EINVAL;
- *valp = *lvalp;
+ WRITE_ONCE(*valp, *lvalp);
}
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
if (val < 0) {
*negp = true;
*lvalp = -(unsigned long)val;
@@ -557,9 +458,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- *valp = *lvalp;
+ WRITE_ONCE(*valp, *lvalp);
} else {
- unsigned int val = *valp;
+ unsigned int val = READ_ONCE(*valp);
*lvalp = (unsigned long)val;
}
return 0;
@@ -577,12 +478,12 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
int *i, vleft, first = 1, err = 0;
size_t left;
char *p;
-
+
if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
return 0;
}
-
+
i = (int *) tbl_data;
vleft = table->maxlen / sizeof(*i);
left = *lenp;
@@ -604,7 +505,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
@@ -631,7 +532,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
@@ -673,7 +574,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left) {
err = -EINVAL;
goto out_free;
@@ -693,7 +594,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data,
}
if (!err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
out_free:
if (err)
@@ -774,18 +675,58 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
}
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
+int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
{
return __do_proc_douintvec(table->data, table, write,
buffer, lenp, ppos, conv, data);
}
/**
+ * proc_dobool - read/write a bool
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes one integer value from/to the user buffer,
+ * treated as an ASCII string.
+ *
+ * table->data must point to a bool variable and table->maxlen must
+ * be sizeof(bool).
+ *
+ * Returns 0 on success.
+ */
+int proc_dobool(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp;
+ bool *data = table->data;
+ int res, val;
+
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(bool))
+ return -EINVAL;
+
+ tmp = *table;
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+
+ val = READ_ONCE(*data);
+ res = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (res)
+ return res;
+ if (write)
+ WRITE_ONCE(*data, val);
+ return 0;
+}
+
+/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
@@ -794,7 +735,7 @@ static int do_proc_douintvec(struct ctl_table *table, int write,
* @ppos: file position
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
+ * values from/to the user buffer, treated as an ASCII string.
*
* Returns 0 on success.
*/
@@ -804,27 +745,6 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer,
return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
}
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
/**
* proc_douintvec - read a vector of unsigned integers
* @table: the sysctl table
@@ -888,17 +808,6 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
* @min: pointer to minimum allowable value
@@ -933,7 +842,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
if ((param->min && *param->min > tmp) ||
(param->max && *param->max < tmp))
return -EINVAL;
- *valp = tmp;
+ WRITE_ONCE(*valp, tmp);
}
return 0;
@@ -999,7 +908,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
(param->max && *param->max < tmp))
return -ERANGE;
- *valp = tmp;
+ WRITE_ONCE(*valp, tmp);
}
return 0;
@@ -1035,66 +944,64 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
+/**
+ * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or an error on write when the range check fails.
+ */
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- if (write) {
- unsigned int val;
+ struct ctl_table tmp;
+ unsigned int min = 0, max = 255U, val;
+ u8 *data = table->data;
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = &min,
+ .max = &max,
+ };
+ int res;
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(u8))
+ return -EINVAL;
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
+ if (table->extra1) {
+ min = *(unsigned int *) table->extra1;
+ if (min > 255U)
+ return -EINVAL;
}
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
+ if (table->extra2) {
+ max = *(unsigned int *) table->extra2;
+ if (max > 255U)
+ return -EINVAL;
}
-#endif
-}
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
+ tmp = *table;
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+ val = READ_ONCE(*data);
+ res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+ if (res)
+ return res;
+ if (write)
+ WRITE_ONCE(*data, val);
+ return 0;
}
-#endif
+EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
@@ -1130,9 +1037,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
return 0;
}
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
+ i = data;
+ min = table->extra1;
+ max = table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
left = *lenp;
@@ -1151,25 +1058,26 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
if (write) {
bool neg;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
- if (err)
+ if (err || neg) {
+ err = -EINVAL;
break;
- if (neg)
- continue;
+ }
+
val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max)) {
err = -EINVAL;
break;
}
- *i = val;
+ WRITE_ONCE(*i, val);
} else {
- val = convdiv * (*i) / convmul;
+ val = convdiv * READ_ONCE(*i) / convmul;
if (!first)
proc_put_char(&buffer, &left, '\t');
proc_put_long(&buffer, &left, val, false);
@@ -1179,7 +1087,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
@@ -1250,9 +1158,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*lvalp > INT_MAX / HZ)
return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ if (*negp)
+ WRITE_ONCE(*valp, -*lvalp * HZ);
+ else
+ WRITE_ONCE(*valp, *lvalp * HZ);
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
unsigned long lval;
if (val < 0) {
*negp = true;
@@ -1298,9 +1209,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
if (jif > INT_MAX)
return 1;
- *valp = (int)jif;
+ WRITE_ONCE(*valp, (int)jif);
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
unsigned long lval;
if (val < 0) {
*negp = true;
@@ -1314,6 +1225,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+ return 0;
+}
+
/**
* proc_dointvec_jiffies - read a vector of integers as seconds
* @table: the sysctl table
@@ -1323,7 +1258,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
* @ppos: file position
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
+ * values from/to the user buffer, treated as an ASCII string.
* The values read are assumed to be in seconds, and are converted into
* jiffies.
*
@@ -1336,6 +1271,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
do_proc_dointvec_jiffies_conv,NULL);
}
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_minmax_conv, &param);
+}
+
/**
* proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
* @table: the sysctl table
@@ -1345,8 +1291,8 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
* @ppos: pointer to the file position
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
* are converted into jiffies.
*
* Returns 0 on success.
@@ -1354,8 +1300,8 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_userhz_jiffies_conv,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_userhz_jiffies_conv, NULL);
}
/**
@@ -1368,8 +1314,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
* @ppos: the current position in the file
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
* are converted into jiffies.
*
* Returns 0 on success.
@@ -1424,7 +1370,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err = 0;
- bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
unsigned long *bitmap = *(unsigned long **) table->data;
@@ -1509,12 +1454,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
proc_skip_char(&p, &left, '\n');
}
left += skipped;
} else {
unsigned long bit_a, bit_b = 0;
+ bool first = 1;
while (left) {
bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
@@ -1559,6 +1504,12 @@ int proc_dostring(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dobool(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -1583,12 +1534,24 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -1654,234 +1617,12 @@ int proc_do_static_key(struct ctl_table *table, int write,
static struct ctl_table kern_table[] = {
{
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_SCHED_DEBUG
- {
- .procname = "sched_min_granularity_ns",
- .data = &sysctl_sched_min_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_latency_ns",
- .data = &sysctl_sched_latency,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_wakeup_granularity_ns",
- .data = &sysctl_sched_wakeup_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_wakeup_granularity_ns,
- .extra2 = &max_wakeup_granularity_ns,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "sched_tunable_scaling",
- .data = &sysctl_sched_tunable_scaling,
- .maxlen = sizeof(enum sched_tunable_scaling),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_tunable_scaling,
- .extra2 = &max_sched_tunable_scaling,
- },
- {
- .procname = "sched_migration_cost_ns",
- .data = &sysctl_sched_migration_cost,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_nr_migrate",
- .data = &sysctl_sched_nr_migrate,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_SCHEDSTATS
- {
- .procname = "sched_schedstats",
- .data = NULL,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_schedstats,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SCHEDSTATS */
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_NUMA_BALANCING
- {
- .procname = "numa_balancing_scan_delay_ms",
- .data = &sysctl_numa_balancing_scan_delay,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_min_ms",
- .data = &sysctl_numa_balancing_scan_period_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_max_ms",
- .data = &sysctl_numa_balancing_scan_period_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_size_mb",
- .data = &sysctl_numa_balancing_scan_size,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
- {
- .procname = "numa_balancing",
- .data = NULL, /* filled in by handler */
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_numa_balancing,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
- {
- .procname = "sched_rt_period_us",
- .data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_rt_handler,
- },
- {
- .procname = "sched_rt_runtime_us",
- .data = &sysctl_sched_rt_runtime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sched_rt_handler,
- },
- {
- .procname = "sched_rr_timeslice_ms",
- .data = &sysctl_sched_rr_timeslice,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sched_rr_handler,
- },
-#ifdef CONFIG_UCLAMP_TASK
- {
- .procname = "sched_util_clamp_min",
- .data = &sysctl_sched_uclamp_util_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_sched_uclamp_handler,
- },
- {
- .procname = "sched_util_clamp_max",
- .data = &sysctl_sched_uclamp_util_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_sched_uclamp_handler,
- },
-#endif
-#ifdef CONFIG_SCHED_AUTOGROUP
- {
- .procname = "sched_autogroup_enabled",
- .data = &sysctl_sched_autogroup_enabled,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
- {
- .procname = "sched_cfs_bandwidth_slice_us",
- .data = &sysctl_sched_cfs_bandwidth_slice,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
-#endif
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
- {
- .procname = "sched_energy_aware",
- .data = &sysctl_sched_energy_aware,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_energy_aware_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_PROVE_LOCKING
- {
- .procname = "prove_locking",
- .data = &prove_locking,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_LOCK_STAT
- {
- .procname = "lock_stat",
- .data = &lock_stat,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
- {
.procname = "panic",
.data = &panic_timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_COREDUMP
- {
- .procname = "core_uses_pid",
- .data = &core_uses_pid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "core_pattern",
- .data = core_pattern,
- .maxlen = CORENAME_MAX_SIZE,
- .mode = 0644,
- .proc_handler = proc_dostring_coredump,
- },
- {
- .procname = "core_pipe_limit",
- .data = &core_pipe_limit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -1895,28 +1636,10 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &neg_one,
+ .extra1 = SYSCTL_NEG_ONE,
.extra2 = SYSCTL_ONE,
},
#endif
-#ifdef CONFIG_LATENCYTOP
- {
- .procname = "latencytop",
- .data = &latencytop_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_latencytop,
- },
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
- {
- .procname = "real-root-dev",
- .data = &real_root_dev,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "print-fatal-signals",
.data = &print_fatal_signals,
@@ -1974,22 +1697,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "ctrl-alt-del",
- .data = &C_A_D,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_FUNCTION_TRACER
- {
- .procname = "ftrace_enabled",
- .data = &ftrace_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = ftrace_enable_sysctl,
- },
-#endif
#ifdef CONFIG_STACK_TRACER
{
.procname = "stack_tracer_enabled",
@@ -2022,18 +1729,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = tracepoint_printk_sysctl,
},
#endif
-#ifdef CONFIG_KEXEC_CORE
- {
- .procname = "kexec_load_disabled",
- .data = &kexec_load_disabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_MODULES
{
.procname = "modprobe",
@@ -2062,24 +1757,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dostring,
},
#endif
-#ifdef CONFIG_CHR_DEV_SG
- {
- .procname = "sg-big-buff",
- .data = &sg_big_buff,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
- {
- .procname = "acct",
- .data = &acct_parm,
- .maxlen = 3*sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_MAGIC_SYSRQ
{
.procname = "sysrq",
@@ -2106,30 +1783,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = sysctl_max_threads,
},
{
- .procname = "random",
- .mode = 0555,
- .child = random_table,
- },
- {
- .procname = "usermodehelper",
- .mode = 0555,
- .child = usermodehelper_table,
- },
-#ifdef CONFIG_FW_LOADER_USER_HELPER
- {
- .procname = "firmware_config",
- .mode = 0555,
- .child = firmware_config_table,
- },
-#endif
- {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
},
{
.procname = "overflowgid",
@@ -2137,8 +1797,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
},
#ifdef CONFIG_S390
{
@@ -2149,17 +1809,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_SMP
- {
- .procname = "oops_all_cpu_backtrace",
- .data = &sysctl_oops_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
{
.procname = "pid_max",
.data = &pid_max,
@@ -2183,66 +1832,9 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
-#if defined CONFIG_PRINTK
- {
- .procname = "printk",
- .data = &console_loglevel,
- .maxlen = 4*sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "printk_ratelimit",
- .data = &printk_ratelimit_state.interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "printk_ratelimit_burst",
- .data = &printk_ratelimit_state.burst,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "printk_delay",
- .data = &printk_delay_msec,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &ten_thousand,
- },
- {
- .procname = "printk_devkmsg",
- .data = devkmsg_log_str,
- .maxlen = DEVKMSG_STR_MAX_SIZE,
- .mode = 0644,
- .proc_handler = devkmsg_sysctl_set_loglvl,
- },
- {
- .procname = "dmesg_restrict",
- .data = &dmesg_restrict,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "kptr_restrict",
- .data = &kptr_restrict,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
-#endif
{
.procname = "ngroups_max",
- .data = &ngroups_max,
+ .data = (void *)&ngroups_max,
.maxlen = sizeof (int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -2254,96 +1846,6 @@ static struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if defined(CONFIG_LOCKUP_DETECTOR)
- {
- .procname = "watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "watchdog_thresh",
- .data = &watchdog_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_watchdog_thresh,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &sixty,
- },
- {
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = NMI_WATCHDOG_SYSCTL_PERM,
- .proc_handler = proc_nmi_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "watchdog_cpumask",
- .data = &watchdog_cpumask_bits,
- .maxlen = NR_CPUS,
- .mode = 0644,
- .proc_handler = proc_watchdog_cpumask,
- },
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
- {
- .procname = "soft_watchdog",
- .data = &soft_watchdog_user_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_soft_watchdog,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "softlockup_panic",
- .data = &softlockup_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "softlockup_all_cpu_backtrace",
- .data = &sysctl_softlockup_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
-#endif
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
- {
- .procname = "hardlockup_panic",
- .data = &hardlockup_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "hardlockup_all_cpu_backtrace",
- .data = &sysctl_hardlockup_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
-#endif
-#endif
-
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.procname = "unknown_nmi_panic",
@@ -2437,69 +1939,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_IA64
- {
- .procname = "unaligned-dump-stack",
- .data = &unaligned_dump_stack,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_DETECT_HUNG_TASK
-#ifdef CONFIG_SMP
- {
- .procname = "hung_task_all_cpu_backtrace",
- .data = &sysctl_hung_task_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
- {
- .procname = "hung_task_panic",
- .data = &sysctl_hung_task_panic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "hung_task_check_count",
- .data = &sysctl_hung_task_check_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "hung_task_timeout_secs",
- .data = &sysctl_hung_task_timeout_secs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_dohung_task_timeout_secs,
- .extra2 = &hung_task_timeout_max,
- },
- {
- .procname = "hung_task_check_interval_secs",
- .data = &sysctl_hung_task_check_interval_secs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_dohung_task_timeout_secs,
- .extra2 = &hung_task_timeout_max,
- },
- {
- .procname = "hung_task_warnings",
- .data = &sysctl_hung_task_warnings,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &neg_one,
- },
-#endif
#ifdef CONFIG_RT_MUTEXES
{
.procname = "max_lock_depth",
@@ -2509,20 +1948,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "poweroff_cmd",
- .data = &poweroff_cmd,
- .maxlen = POWEROFF_CMD_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
-#ifdef CONFIG_KEYS
- {
- .procname = "keys",
- .mode = 0555,
- .child = key_sysctls,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
/*
* User-space scripts rely on the existence of this file
@@ -2549,7 +1974,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
- .proc_handler = perf_proc_update_handler,
+ .proc_handler = perf_event_max_sample_rate_handler,
.extra1 = SYSCTL_ONE,
},
{
@@ -2559,7 +1984,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_cpu_time_max_percent_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "perf_event_max_stack",
@@ -2568,7 +1993,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &six_hundred_forty_kb,
+ .extra2 = (void *)&six_hundred_forty_kb,
},
{
.procname = "perf_event_max_contexts_per_stack",
@@ -2577,7 +2002,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_thousand,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
#endif
{
@@ -2589,37 +2014,7 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- {
- .procname = "timer_migration",
- .data = &sysctl_timer_migration,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = timer_migration_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_BPF_SYSCALL
- {
- .procname = "unprivileged_bpf_disabled",
- .data = &sysctl_unprivileged_bpf_disabled,
- .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
- .mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "bpf_stats_enabled",
- .data = &bpf_stats_enabled_key.key,
- .maxlen = sizeof(bpf_stats_enabled_key),
- .mode = 0644,
- .proc_handler = bpf_stats_handler,
- },
-#endif
-#if defined(CONFIG_TREE_RCU)
+#ifdef CONFIG_TREE_RCU
{
.procname = "panic_on_rcu_stall",
.data = &sysctl_panic_on_rcu_stall,
@@ -2629,16 +2024,14 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
-#endif
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
{
- .procname = "stack_erasing",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = stack_erasing_sysctl,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
#endif
{ }
@@ -2650,32 +2043,9 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = overcommit_policy_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "panic_on_oom",
- .data = &sysctl_panic_on_oom,
- .maxlen = sizeof(sysctl_panic_on_oom),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "oom_kill_allocating_task",
- .data = &sysctl_oom_kill_allocating_task,
- .maxlen = sizeof(sysctl_oom_kill_allocating_task),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "oom_dump_tasks",
- .data = &sysctl_oom_dump_tasks,
- .maxlen = sizeof(sysctl_oom_dump_tasks),
- .mode = 0644,
- .proc_handler = proc_dointvec,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "overcommit_ratio",
@@ -2698,55 +2068,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "dirty_background_ratio",
- .data = &dirty_background_ratio,
- .maxlen = sizeof(dirty_background_ratio),
- .mode = 0644,
- .proc_handler = dirty_background_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
- },
- {
- .procname = "dirty_background_bytes",
- .data = &dirty_background_bytes,
- .maxlen = sizeof(dirty_background_bytes),
- .mode = 0644,
- .proc_handler = dirty_background_bytes_handler,
- .extra1 = &one_ul,
- },
- {
- .procname = "dirty_ratio",
- .data = &vm_dirty_ratio,
- .maxlen = sizeof(vm_dirty_ratio),
- .mode = 0644,
- .proc_handler = dirty_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
- },
- {
- .procname = "dirty_bytes",
- .data = &vm_dirty_bytes,
- .maxlen = sizeof(vm_dirty_bytes),
- .mode = 0644,
- .proc_handler = dirty_bytes_handler,
- .extra1 = &dirty_bytes_min,
- },
- {
- .procname = "dirty_writeback_centisecs",
- .data = &dirty_writeback_interval,
- .maxlen = sizeof(dirty_writeback_interval),
- .mode = 0644,
- .proc_handler = dirty_writeback_centisecs_handler,
- },
- {
- .procname = "dirty_expire_centisecs",
- .data = &dirty_expire_interval,
- .maxlen = sizeof(dirty_expire_interval),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
@@ -2763,124 +2085,34 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two_hundred,
- },
-#ifdef CONFIG_HUGETLB_PAGE
- {
- .procname = "nr_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = hugetlb_sysctl_handler,
+ .extra2 = SYSCTL_TWO_HUNDRED,
},
#ifdef CONFIG_NUMA
{
- .procname = "nr_hugepages_mempolicy",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = &hugetlb_mempolicy_sysctl_handler,
- },
- {
- .procname = "numa_stat",
- .data = &sysctl_vm_numa_stat,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_vm_numa_stat_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
- {
- .procname = "hugetlb_shm_group",
- .data = &sysctl_hugetlb_shm_group,
- .maxlen = sizeof(gid_t),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nr_overcommit_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = hugetlb_overcommit_handler,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
- .procname = "lowmem_reserve_ratio",
- .data = &sysctl_lowmem_reserve_ratio,
- .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
- .mode = 0644,
- .proc_handler = lowmem_reserve_ratio_sysctl_handler,
- },
- {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &four,
+ .extra2 = SYSCTL_FOUR,
},
-#ifdef CONFIG_COMPACTION
{
- .procname = "compact_memory",
- .data = &sysctl_compact_memory,
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = sysctl_compaction_handler,
- },
- {
- .procname = "extfrag_threshold",
- .data = &sysctl_extfrag_threshold,
- .maxlen = sizeof(int),
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &min_extfrag_threshold,
- .extra2 = &max_extfrag_threshold,
- },
- {
- .procname = "compact_unevictable_allowed",
- .data = &sysctl_compact_unevictable_allowed,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_warn_RT_change,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-
-#endif /* CONFIG_COMPACTION */
- {
- .procname = "min_free_kbytes",
- .data = &min_free_kbytes,
- .maxlen = sizeof(min_free_kbytes),
- .mode = 0644,
- .proc_handler = min_free_kbytes_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_boost_factor",
- .data = &watermark_boost_factor,
- .maxlen = sizeof(watermark_boost_factor),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_scale_factor",
- .data = &watermark_scale_factor,
- .maxlen = sizeof(watermark_scale_factor),
- .mode = 0644,
- .proc_handler = watermark_scale_factor_sysctl_handler,
- .extra1 = SYSCTL_ONE,
- .extra2 = &one_thousand,
- },
- {
- .procname = "percpu_pagelist_fraction",
- .data = &percpu_pagelist_fraction,
- .maxlen = sizeof(percpu_pagelist_fraction),
- .mode = 0644,
- .proc_handler = percpu_pagelist_fraction_sysctl_handler,
.extra1 = SYSCTL_ZERO,
},
#ifdef CONFIG_MMU
@@ -2903,26 +2135,11 @@ static struct ctl_table vm_table[] = {
},
#endif
{
- .procname = "laptop_mode",
- .data = &laptop_mode,
- .maxlen = sizeof(laptop_mode),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "block_dump",
- .data = &block_dump,
- .maxlen = sizeof(block_dump),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "vfs_cache_pressure",
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
@@ -2932,7 +2149,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_legacy_va_layout,
.maxlen = sizeof(sysctl_legacy_va_layout),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#endif
@@ -2942,26 +2159,8 @@ static struct ctl_table vm_table[] = {
.data = &node_reclaim_mode,
.maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
- .proc_handler = proc_dointvec,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "min_unmapped_ratio",
- .data = &sysctl_min_unmapped_ratio,
- .maxlen = sizeof(sysctl_min_unmapped_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
- },
- {
- .procname = "min_slab_ratio",
- .data = &sysctl_min_slab_ratio,
- .maxlen = sizeof(sysctl_min_slab_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
},
#endif
#ifdef CONFIG_SMP
@@ -2989,15 +2188,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = mmap_min_addr_handler,
},
#endif
-#ifdef CONFIG_NUMA
- {
- .procname = "numa_zonelist_order",
- .data = &numa_zonelist_order,
- .maxlen = NUMA_ZONELIST_ORDER_LEN,
- .mode = 0644,
- .proc_handler = numa_zonelist_order_handler,
- },
-#endif
#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
@@ -3014,37 +2204,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
#endif
-#ifdef CONFIG_HIGHMEM
- {
- .procname = "highmem_is_dirtyable",
- .data = &vm_highmem_is_dirtyable,
- .maxlen = sizeof(vm_highmem_is_dirtyable),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {
- .procname = "memory_failure_early_kill",
- .data = &sysctl_memory_failure_early_kill,
- .maxlen = sizeof(sysctl_memory_failure_early_kill),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "memory_failure_recovery",
- .data = &sysctl_memory_failure_recovery,
- .maxlen = sizeof(sysctl_memory_failure_recovery),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
{
.procname = "user_reserve_kbytes",
.data = &sysctl_user_reserve_kbytes,
@@ -3081,291 +2240,14 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
-#ifdef CONFIG_USERFAULTFD
- {
- .procname = "unprivileged_userfaultfd",
- .data = &sysctl_unprivileged_userfaultfd,
- .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
- { }
-};
-
-static struct ctl_table fs_table[] = {
- {
- .procname = "inode-nr",
- .data = &inodes_stat,
- .maxlen = 2*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_inodes,
- },
- {
- .procname = "inode-state",
- .data = &inodes_stat,
- .maxlen = 7*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_inodes,
- },
- {
- .procname = "file-nr",
- .data = &files_stat,
- .maxlen = sizeof(files_stat),
- .mode = 0444,
- .proc_handler = proc_nr_files,
- },
- {
- .procname = "file-max",
- .data = &files_stat.max_files,
- .maxlen = sizeof(files_stat.max_files),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &zero_ul,
- .extra2 = &long_max,
- },
- {
- .procname = "nr_open",
- .data = &sysctl_nr_open,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &sysctl_nr_open_min,
- .extra2 = &sysctl_nr_open_max,
- },
- {
- .procname = "dentry-state",
- .data = &dentry_stat,
- .maxlen = 6*sizeof(long),
- .mode = 0444,
- .proc_handler = proc_nr_dentry,
- },
- {
- .procname = "overflowuid",
- .data = &fs_overflowuid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
- },
- {
- .procname = "overflowgid",
- .data = &fs_overflowgid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &minolduid,
- .extra2 = &maxolduid,
- },
-#ifdef CONFIG_FILE_LOCKING
- {
- .procname = "leases-enable",
- .data = &leases_enable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_DNOTIFY
- {
- .procname = "dir-notify-enable",
- .data = &dir_notify_enable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_MMU
-#ifdef CONFIG_FILE_LOCKING
- {
- .procname = "lease-break-time",
- .data = &lease_break_time,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_AIO
- {
- .procname = "aio-nr",
- .data = &aio_nr,
- .maxlen = sizeof(aio_nr),
- .mode = 0444,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "aio-max-nr",
- .data = &aio_max_nr,
- .maxlen = sizeof(aio_max_nr),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
-#endif /* CONFIG_AIO */
-#ifdef CONFIG_INOTIFY_USER
- {
- .procname = "inotify",
- .mode = 0555,
- .child = inotify_table,
- },
-#endif
-#ifdef CONFIG_EPOLL
- {
- .procname = "epoll",
- .mode = 0555,
- .child = epoll_table,
- },
-#endif
-#endif
- {
- .procname = "protected_symlinks",
- .data = &sysctl_protected_symlinks,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "protected_hardlinks",
- .data = &sysctl_protected_hardlinks,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "protected_fifos",
- .data = &sysctl_protected_fifos,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "protected_regular",
- .data = &sysctl_protected_regular,
- .maxlen = sizeof(int),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "suid_dumpable",
- .data = &suid_dumpable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_coredump,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
-#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
- {
- .procname = "binfmt_misc",
- .mode = 0555,
- .child = sysctl_mount_point,
- },
-#endif
- {
- .procname = "pipe-max-size",
- .data = &pipe_max_size,
- .maxlen = sizeof(pipe_max_size),
- .mode = 0644,
- .proc_handler = proc_dopipe_max_size,
- },
- {
- .procname = "pipe-user-pages-hard",
- .data = &pipe_user_pages_hard,
- .maxlen = sizeof(pipe_user_pages_hard),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "pipe-user-pages-soft",
- .data = &pipe_user_pages_soft,
- .maxlen = sizeof(pipe_user_pages_soft),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "mount-max",
- .data = &sysctl_mount_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
- { }
-};
-
-static struct ctl_table debug_table[] = {
-#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
- {
- .procname = "exception-trace",
- .data = &show_unhandled_signals,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#endif
-#if defined(CONFIG_OPTPROBES)
- {
- .procname = "kprobes-optimization",
- .data = &sysctl_kprobes_optimization,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_kprobes_optimization_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
- { }
-};
-
-static struct ctl_table dev_table[] = {
- { }
-};
-
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
{ }
};
-int __init sysctl_init(void)
+int __init sysctl_init_bases(void)
{
- struct ctl_table_header *hdr;
+ register_sysctl_init("kernel", kern_table);
+ register_sysctl_init("vm", vm_table);
- hdr = register_sysctl_table(sysctl_base_table);
- kmemleak_not_leak(hdr);
return 0;
}
#endif /* CONFIG_SYSCTL */
@@ -3373,6 +2255,7 @@ int __init sysctl_init(void)
* No sense putting this after each symbol definition, twice,
* exception granted :-)
*/
+EXPORT_SYMBOL(proc_dobool);
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
deleted file mode 100644
index 7d550cc76a3b..000000000000
--- a/kernel/sysctl_binary.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/nsproxy.h>
-#include <linux/pid_namespace.h>
-#include <linux/file.h>
-#include <linux/ctype.h>
-#include <linux/netdevice.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <linux/slab.h>
-#include <linux/compat.h>
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-
-static void deprecated_sysctl_warning(const int *name, int nlen)
-{
- int i;
-
- /*
- * CTL_KERN/KERN_VERSION is used by older glibc and cannot
- * ever go away.
- */
- if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
- return;
-
- if (printk_ratelimit()) {
- printk(KERN_INFO
- "warning: process `%s' used the deprecated sysctl "
- "system call with ", current->comm);
- for (i = 0; i < nlen; i++)
- printk(KERN_CONT "%d.", name[i]);
- printk(KERN_CONT "\n");
- }
- return;
-}
-
-#define WARN_ONCE_HASH_BITS 8
-#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
-
-static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
-
-#define FNV32_OFFSET 2166136261U
-#define FNV32_PRIME 0x01000193
-
-/*
- * Print each legacy sysctl (approximately) only once.
- * To avoid making the tables non-const use a external
- * hash-table instead.
- * Worst case hash collision: 6, but very rarely.
- * NOTE! We don't use the SMP-safe bit tests. We simply
- * don't care enough.
- */
-static void warn_on_bintable(const int *name, int nlen)
-{
- int i;
- u32 hash = FNV32_OFFSET;
-
- for (i = 0; i < nlen; i++)
- hash = (hash ^ name[i]) * FNV32_PRIME;
- hash %= WARN_ONCE_HASH_SIZE;
- if (__test_and_set_bit(hash, warn_once_bitmap))
- return;
- deprecated_sysctl_warning(name, nlen);
-}
-
-static ssize_t do_sysctl(int __user *args_name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- int name[CTL_MAXNAME];
- int i;
-
- /* Check args->nlen. */
- if (nlen < 0 || nlen > CTL_MAXNAME)
- return -ENOTDIR;
- /* Read in the sysctl name for simplicity */
- for (i = 0; i < nlen; i++)
- if (get_user(name[i], args_name + i))
- return -EFAULT;
-
- warn_on_bintable(name, nlen);
-
- return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
- struct __sysctl_args tmp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
- tmp.newval, tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-
-#ifdef CONFIG_COMPAT
-
-struct compat_sysctl_args {
- compat_uptr_t name;
- int nlen;
- compat_uptr_t oldval;
- compat_uptr_t oldlenp;
- compat_uptr_t newval;
- compat_size_t newlen;
- compat_ulong_t __unused[4];
-};
-
-COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
-{
- struct compat_sysctl_args tmp;
- compat_size_t __user *compat_oldlenp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- compat_oldlenp = compat_ptr(tmp.oldlenp);
- if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
- compat_ptr(tmp.oldval), oldlen,
- compat_ptr(tmp.newval), tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-#endif /* CONFIG_COMPAT */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c0848ca1287..95a7e1b7f1da 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
static struct callback_head work_exited; /* all we need is ->next == NULL */
@@ -9,44 +9,65 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
* @work: the callback to run
- * @notify: send the notification if true
+ * @notify: how to notify the targeted task
+ *
+ * Queue @work for task_work_run() below and notify the @task if @notify
+ * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
+ *
+ * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted
+ * task and run the task_work, regardless of whether the task is currently
+ * running in the kernel or userspace.
+ * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
+ * reschedule IPI to force the targeted task to reschedule and run task_work.
+ * This can be advantageous if there's no strict requirement that the
+ * task_work be run as soon as possible, just whenever the task enters the
+ * kernel anyway.
+ * @TWA_RESUME work is run only when the task exits the kernel and returns to
+ * user mode, or before entering guest mode.
*
- * Queue @work for task_work_run() below and notify the @task if @notify.
* Fails if the @task is exiting/exited and thus it can't process this @work.
- * Otherwise @work->func() will be called when the @task returns from kernel
- * mode or exits.
+ * Otherwise @work->func() will be called when the @task goes through one of
+ * the aforementioned transitions, or exits.
*
- * This is like the signal handler which runs in kernel mode, but it doesn't
- * try to wake up the @task.
+ * If the targeted task is exiting, then an error is returned and the work item
+ * is not queued. It's up to the caller to arrange for an alternative mechanism
+ * in that case.
*
- * Note: there is no ordering guarantee on works queued here.
+ * Note: there is no ordering guarantee on works queued here. The task_work
+ * list is LIFO.
*
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
-int
-task_work_add(struct task_struct *task, struct callback_head *work, int notify)
+int task_work_add(struct task_struct *task, struct callback_head *work,
+ enum task_work_notify_mode notify)
{
struct callback_head *head;
- unsigned long flags;
+ /* record the work call stack in order to print it in KASAN reports */
+ kasan_record_aux_stack(work);
+
+ head = READ_ONCE(task->task_works);
do {
- head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
- } while (cmpxchg(&task->task_works, head, work) != head);
+ } while (!try_cmpxchg(&task->task_works, &head, work));
switch (notify) {
+ case TWA_NONE:
+ break;
case TWA_RESUME:
set_notify_resume(task);
break;
case TWA_SIGNAL:
- if (lock_task_sighand(task, &flags)) {
- task->jobctl |= JOBCTL_TASK_WORK;
- signal_wake_up(task, 0);
- unlock_task_sighand(task, &flags);
- }
+ set_notify_signal(task);
+ break;
+ case TWA_SIGNAL_NO_IPI:
+ __set_notify_signal(task);
+ break;
+ default:
+ WARN_ON_ONCE(1);
break;
}
@@ -54,24 +75,24 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
+ * task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
- * @func: identifies the work to remove
- *
- * Find the last queued pending work with ->func == @func and remove
- * it from queue.
+ * @match: match function to call
+ * @data: data to be passed in to match function
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_match(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data),
+ void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
unsigned long flags;
- if (likely(!task->task_works))
+ if (likely(!task_work_pending(task)))
return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
@@ -80,10 +101,12 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = READ_ONCE(*pprev))) {
- if (work->func != func)
+ work = READ_ONCE(*pprev);
+ while (work) {
+ if (!match(work, data)) {
pprev = &work->next;
- else if (cmpxchg(pprev, work, work->next) == work)
+ work = READ_ONCE(*pprev);
+ } else if (try_cmpxchg(pprev, &work, work->next))
break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -91,6 +114,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
+static bool task_work_func_match(struct callback_head *cb, void *data)
+{
+ return cb->func == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ return task_work_cancel_match(task, task_work_func_match, func);
+}
+
/**
* task_work_run - execute the works added by task_work_add()
*
@@ -109,16 +154,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ work = READ_ONCE(task->task_works);
do {
head = NULL;
- work = READ_ONCE(task->task_works);
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
- } while (cmpxchg(&task->task_works, work, head) != work);
+ } while (!try_cmpxchg(&task->task_works, &work, head));
if (!work)
break;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e2ac0e37c4ae..4354ea231fab 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
@@ -34,17 +35,13 @@ struct kmem_cache *taskstats_cache;
static struct genl_family family;
-static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
+static const struct nla_policy taskstats_cmd_get_policy[] = {
[TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
-/*
- * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
- * Make sure they are always aligned.
- */
-static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
+static const struct nla_policy cgroupstats_cmd_get_policy[] = {
[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};
@@ -117,13 +114,14 @@ static void send_cpu_listeners(struct sk_buff *skb,
struct listener *s, *tmp;
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
- int rc, delcount = 0;
+ int delcount = 0;
genlmsg_end(skb, reply);
- rc = 0;
down_read(&listeners->sem);
list_for_each_entry(s, &listeners->list, list) {
+ int rc;
+
skb_next = NULL;
if (!list_is_last(&s->list, &listeners->list)) {
skb_next = skb_clone(skb_cur, GFP_KERNEL);
@@ -156,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
+static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ /* No idea if I'm allowed to access that here, now. */
+ struct file *exe_file = get_task_exe_file(tsk);
+
+ if (exe_file) {
+ /* Following cp_new_stat64() in stat.c . */
+ stats->ac_exe_dev =
+ huge_encode_dev(exe_file->f_inode->i_sb->s_dev);
+ stats->ac_exe_inode = exe_file->f_inode->i_ino;
+ fput(exe_file);
+ } else {
+ stats->ac_exe_dev = 0;
+ stats->ac_exe_inode = 0;
+ }
+}
+
static void fill_stats(struct user_namespace *user_ns,
struct pid_namespace *pid_ns,
struct task_struct *tsk, struct taskstats *stats)
@@ -178,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns,
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
+
+ /* add executable info */
+ exe_add_tsk(stats, tsk);
}
static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
@@ -215,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
else
memset(stats, 0, sizeof(*stats));
- tsk = first;
start_time = ktime_get_ns();
- do {
+ for_each_thread(first, tsk) {
if (tsk->exit_state)
continue;
/*
@@ -240,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
- } while_each_thread(first, tsk);
+ }
unlock_task_sighand(first, &flags);
rc = 0;
@@ -350,7 +367,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
data = kmalloc(len, GFP_KERNEL);
if (!data)
return -ENOMEM;
- nla_strlcpy(data, na, len);
+ nla_strscpy(data, na, len);
ret = cpulist_parse(data, mask);
kfree(data);
return ret;
@@ -623,6 +640,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
goto err;
fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
+ if (group_dead)
+ stats->ac_flag |= AGROUP;
/*
* Doesn't matter if tsk is the leader or the last group member leaving
@@ -649,47 +668,27 @@ static const struct genl_ops taskstats_ops[] = {
.cmd = TASKSTATS_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = taskstats_user_cmd,
- /* policy enforced later */
- .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL,
+ .policy = taskstats_cmd_get_policy,
+ .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1,
+ .flags = GENL_ADMIN_PERM,
},
{
.cmd = CGROUPSTATS_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = cgroupstats_user_cmd,
- /* policy enforced later */
- .flags = GENL_CMD_CAP_HASPOL,
+ .policy = cgroupstats_cmd_get_policy,
+ .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1,
},
};
-static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info)
-{
- const struct nla_policy *policy = NULL;
-
- switch (ops->cmd) {
- case TASKSTATS_CMD_GET:
- policy = taskstats_cmd_get_policy;
- break;
- case CGROUPSTATS_CMD_GET:
- policy = cgroupstats_cmd_get_policy;
- break;
- default:
- return -EINVAL;
- }
-
- return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN,
- TASKSTATS_CMD_ATTR_MAX, policy,
- info->extack);
-}
-
static struct genl_family family __ro_after_init = {
.name = TASKSTATS_GENL_NAME,
.version = TASKSTATS_GENL_VERSION,
- .maxattr = TASKSTATS_CMD_ATTR_MAX,
.module = THIS_MODULE,
.ops = taskstats_ops,
.n_ops = ARRAY_SIZE(taskstats_ops),
- .pre_doit = taskstats_pre_doit,
+ .resv_start_op = CGROUPSTATS_CMD_GET + 1,
+ .netnsok = true,
};
/* Needed early in initialization */
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
deleted file mode 100644
index 76c997fdbc9d..000000000000
--- a/kernel/test_kprobes.c
+++ /dev/null
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * test_kprobes.c - simple sanity test for *probes
- *
- * Copyright IBM Corp. 2008
- */
-
-#define pr_fmt(fmt) "Kprobe smoke test: " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/random.h>
-
-#define div_factor 3
-
-static u32 rand1, preh_val, posth_val;
-static int errors, handler_errors, num_tests;
-static u32 (*target)(u32 value);
-static u32 (*target2)(u32 value);
-
-static noinline u32 kprobe_target(u32 value)
-{
- return (value / div_factor);
-}
-
-static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("pre-handler is preemptible\n");
- }
- preh_val = (rand1 / div_factor);
- return 0;
-}
-
-static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("post-handler is preemptible\n");
- }
- if (preh_val != (rand1 / div_factor)) {
- handler_errors++;
- pr_err("incorrect value in post_handler\n");
- }
- posth_val = preh_val + div_factor;
-}
-
-static struct kprobe kp = {
- .symbol_name = "kprobe_target",
- .pre_handler = kp_pre_handler,
- .post_handler = kp_post_handler
-};
-
-static int test_kprobe(void)
-{
- int ret;
-
- ret = register_kprobe(&kp);
- if (ret < 0) {
- pr_err("register_kprobe returned %d\n", ret);
- return ret;
- }
-
- ret = target(rand1);
- unregister_kprobe(&kp);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler not called\n");
- handler_errors++;
- }
-
- return 0;
-}
-
-static noinline u32 kprobe_target2(u32 value)
-{
- return (value / div_factor) + 1;
-}
-
-static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
-{
- preh_val = (rand1 / div_factor) + 1;
- return 0;
-}
-
-static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
-{
- if (preh_val != (rand1 / div_factor) + 1) {
- handler_errors++;
- pr_err("incorrect value in post_handler2\n");
- }
- posth_val = preh_val + div_factor;
-}
-
-static struct kprobe kp2 = {
- .symbol_name = "kprobe_target2",
- .pre_handler = kp_pre_handler2,
- .post_handler = kp_post_handler2
-};
-
-static int test_kprobes(void)
-{
- int ret;
- struct kprobe *kps[2] = {&kp, &kp2};
-
- /* addr and flags should be cleard for reusing kprobe. */
- kp.addr = NULL;
- kp.flags = 0;
- ret = register_kprobes(kps, 2);
- if (ret < 0) {
- pr_err("register_kprobes returned %d\n", ret);
- return ret;
- }
-
- preh_val = 0;
- posth_val = 0;
- ret = target(rand1);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler not called\n");
- handler_errors++;
- }
-
- preh_val = 0;
- posth_val = 0;
- ret = target2(rand1);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler2 not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler2 not called\n");
- handler_errors++;
- }
-
- unregister_kprobes(kps, 2);
- return 0;
-
-}
-
-#ifdef CONFIG_KRETPROBES
-static u32 krph_val;
-
-static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("kretprobe entry handler is preemptible\n");
- }
- krph_val = (rand1 / div_factor);
- return 0;
-}
-
-static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- unsigned long ret = regs_return_value(regs);
-
- if (preemptible()) {
- handler_errors++;
- pr_err("kretprobe return handler is preemptible\n");
- }
- if (ret != (rand1 / div_factor)) {
- handler_errors++;
- pr_err("incorrect value in kretprobe handler\n");
- }
- if (krph_val == 0) {
- handler_errors++;
- pr_err("call to kretprobe entry handler failed\n");
- }
-
- krph_val = rand1;
- return 0;
-}
-
-static struct kretprobe rp = {
- .handler = return_handler,
- .entry_handler = entry_handler,
- .kp.symbol_name = "kprobe_target"
-};
-
-static int test_kretprobe(void)
-{
- int ret;
-
- ret = register_kretprobe(&rp);
- if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
- return ret;
- }
-
- ret = target(rand1);
- unregister_kretprobe(&rp);
- if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
- handler_errors++;
- }
-
- return 0;
-}
-
-static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- unsigned long ret = regs_return_value(regs);
-
- if (ret != (rand1 / div_factor) + 1) {
- handler_errors++;
- pr_err("incorrect value in kretprobe handler2\n");
- }
- if (krph_val == 0) {
- handler_errors++;
- pr_err("call to kretprobe entry handler failed\n");
- }
-
- krph_val = rand1;
- return 0;
-}
-
-static struct kretprobe rp2 = {
- .handler = return_handler2,
- .entry_handler = entry_handler,
- .kp.symbol_name = "kprobe_target2"
-};
-
-static int test_kretprobes(void)
-{
- int ret;
- struct kretprobe *rps[2] = {&rp, &rp2};
-
- /* addr and flags should be cleard for reusing kprobe. */
- rp.kp.addr = NULL;
- rp.kp.flags = 0;
- ret = register_kretprobes(rps, 2);
- if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
- return ret;
- }
-
- krph_val = 0;
- ret = target(rand1);
- if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
- handler_errors++;
- }
-
- krph_val = 0;
- ret = target2(rand1);
- if (krph_val != rand1) {
- pr_err("kretprobe handler2 not called\n");
- handler_errors++;
- }
- unregister_kretprobes(rps, 2);
- return 0;
-}
-#endif /* CONFIG_KRETPROBES */
-
-int init_test_probes(void)
-{
- int ret;
-
- target = kprobe_target;
- target2 = kprobe_target2;
-
- do {
- rand1 = prandom_u32();
- } while (rand1 <= div_factor);
-
- pr_info("started\n");
- num_tests++;
- ret = test_kprobe();
- if (ret < 0)
- errors++;
-
- num_tests++;
- ret = test_kprobes();
- if (ret < 0)
- errors++;
-
-#ifdef CONFIG_KRETPROBES
- num_tests++;
- ret = test_kretprobe();
- if (ret < 0)
- errors++;
-
- num_tests++;
- ret = test_kretprobes();
- if (ret < 0)
- errors++;
-#endif /* CONFIG_KRETPROBES */
-
- if (errors)
- pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
- else if (handler_errors)
- pr_err("BUG: %d error(s) running handlers\n", handler_errors);
- else
- pr_info("passed successfully\n");
-
- return 0;
-}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..bae8f11070be 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -26,13 +26,9 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
config GENERIC_TIME_VSYSCALL
bool
-# Old style timekeeping
-config ARCH_USES_GETTIMEOFFSET
- bool
-
# The generic clock events infrastructure
config GENERIC_CLOCKEVENTS
- bool
+ def_bool !LEGACY_TIMER_TICK
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
@@ -52,6 +48,40 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
config GENERIC_CMOS_UPDATE
bool
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+ bool
+ default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
+config LEGACY_TIMER_TICK
+ bool
+ help
+ The legacy timer tick helper is used by platforms that
+ lack support for the generic clockevent framework.
+ New platforms should use generic clockevents instead.
+
+config TIME_KUNIT_TEST
+ tristate "KUnit test for kernel/time functions" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to test RTC library functions.
+
+ If unsure, say N.
+
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_IDLE
+ bool
+ select CONTEXT_TRACKING
+ help
+ Tracks idle state on behalf of RCU.
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
@@ -63,7 +93,6 @@ config TICK_ONESHOT
config NO_HZ_COMMON
bool
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
choice
@@ -78,7 +107,6 @@ config HZ_PERIODIC
config NO_HZ_IDLE
bool "Idle dynticks system (tickless idle)"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select NO_HZ_COMMON
help
This option enables a tickless idle system: timer interrupts
@@ -90,10 +118,9 @@ config NO_HZ_IDLE
config NO_HZ_FULL
bool "Full dynticks system (tickless)"
# NO_HZ_COMMON dependency
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
- depends on HAVE_CONTEXT_TRACKING
+ depends on HAVE_CONTEXT_TRACKING_USER
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
@@ -108,48 +135,54 @@ config NO_HZ_FULL
the task mostly runs in userspace and has few kernel activity.
You need to fill up the nohz_full boot parameter with the
- desired range of dynticks CPUs.
+ desired range of dynticks CPUs to use it. This is implemented at
+ the expense of some overhead in user <-> kernel transitions:
+ syscalls, exceptions and interrupts.
- This is implemented at the expense of some overhead in user <-> kernel
- transitions: syscalls, exceptions and interrupts. Even when it's
- dynamically off.
+ By default, without passing the nohz_full parameter, this behaves just
+ like NO_HZ_IDLE.
- Say N.
+ If you're a distro say Y.
endchoice
-config CONTEXT_TRACKING
- bool
+config CONTEXT_TRACKING_USER
+ bool
+ depends on HAVE_CONTEXT_TRACKING_USER
+ select CONTEXT_TRACKING
+ help
+ Track transitions between kernel and user on behalf of RCU and
+ tickless cputime accounting. The former case relies on context
+ tracking to enter/exit RCU extended quiescent states.
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
+config CONTEXT_TRACKING_USER_FORCE
+ bool "Force user context tracking"
+ depends on CONTEXT_TRACKING_USER
default y if !NO_HZ_FULL
help
The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
+ support the user context tracking subsystem. But there are also
other dependencies to provide in order to make the full
dynticks working.
This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
+ user context tracking backend but doesn't yet fulfill all the
requirements to make the full dynticks feature working.
Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
+ for user context tracking and the subsystems that rely on it: RCU
userspace extended quiescent state and tickless cputime
accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
+ dynticks subsystem by forcing the user context tracking on all
CPUs in the system.
Say Y only if you're working on the development of an
- architecture backend for the context tracking.
+ architecture backend for the user context tracking.
Say N otherwise, this option brings an overhead that you
don't want in production.
config NO_HZ
bool "Old Idle dynticks config"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
help
This is the old config entry that enables dynticks idle.
We keep it around for a little while to enforce backward
@@ -157,12 +190,24 @@ config NO_HZ
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables high resolution timer support. If your
hardware is not capable then this option only increases
the size of the kernel image.
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+ int "Clocksource watchdog maximum allowable skew (in μs)"
+ depends on CLOCKSOURCE_WATCHDOG
+ range 50 1000
+ default 125
+ help
+ Specify the maximum amount of allowable watchdog skew in
+ microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
+
endmenu
endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c8f00168afe8..7e875e63ff3b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,7 +16,10 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
+obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2ffb466af77e..4657cb8e8b1f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -2,13 +2,13 @@
/*
* Alarmtimer interface
*
- * This interface provides a timer which is similarto hrtimers,
+ * This interface provides a timer which is similar to hrtimers,
* but triggers a RTC alarm if the box is suspend.
*
* This interface is influenced by the Android RTC Alarm timer
* interface.
*
- * Copyright (C) 2010 IBM Corperation
+ * Copyright (C) 2010 IBM Corporation
*
* Author: John Stultz <john.stultz@linaro.org>
*/
@@ -81,8 +81,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
}
EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
-static int alarmtimer_rtc_add_device(struct device *dev,
- struct class_interface *class_intf)
+static int alarmtimer_rtc_add_device(struct device *dev)
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
@@ -92,7 +91,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (rtcdev)
return -EBUSY;
- if (!rtc->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
return -1;
if (!device_may_wakeup(rtc->dev.parent))
return -1;
@@ -192,7 +191,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
* When a alarm timer fires, this runs through the timerqueue to
* see which alarms expired, and runs those. If there are more alarm
* timers queued for the future, we set the hrtimer to fire when
- * when the next future alarm timer expires.
+ * the next future alarm timer expires.
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
@@ -291,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev)
rtc_timer_cancel(rtc, &rtctimer);
rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
+
+ /*
+ * If the RTC alarm timer only supports a limited time offset, set the
+ * alarm time to the maximum supported value.
+ * The system may wake up earlier (possibly much earlier) than expected
+ * when the alarmtimer runs. This is the best the kernel can do if
+ * the alarmtimer exceeds the time that the rtc device can be programmed
+ * for.
+ */
+ min = rtc_bound_alarmtime(rtc, min);
+
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
@@ -470,11 +480,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->get_ktime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
- return alarm_forward(alarm, base->get_ktime(), interval);
+ return alarm_forward(alarm, now, interval);
+}
+
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -527,8 +561,11 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
/**
* alarm_handle_timer - Callback for posix timers
* @alarm: alarm that fired
+ * @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
+ *
+ * Return: whether the timer is to be restarted
*/
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
@@ -548,9 +585,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals proper. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
@@ -715,13 +753,16 @@ static int alarm_timer_create(struct k_itimer *new_timer)
/**
* alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
* @alarm: ptr to alarm that fired
+ * @now: time at the timer expiration
*
* Wakes up the task that set the alarmtimer
+ *
+ * Return: ALARMTIMER_NORESTART
*/
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
{
- struct task_struct *task = (struct task_struct *)alarm->data;
+ struct task_struct *task = alarm->data;
alarm->data = NULL;
if (task)
@@ -733,6 +774,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
* alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
* @alarm: ptr to alarmtimer
* @absexp: absolute expiration time
+ * @type: alarm type (BOOTTIME/REALTIME).
*
* Sets the alarm timer and sleeps until it is fired or interrupted.
*/
@@ -804,9 +846,8 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
/**
* alarm_timer_nsleep - alarmtimer nanosleep
* @which_clock: clockid
- * @flags: determins abstime or relative
+ * @flags: determines abstime or relative
* @tsreq: requested sleep time (abs or rel)
- * @rmtp: remaining sleep time saved
*
* Handles clock_nanosleep calls against _ALARM clockids
*/
@@ -817,7 +858,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
- int ret = 0;
+ int ret;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
@@ -848,9 +889,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
+ set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
@@ -908,7 +949,7 @@ static int __init alarmtimer_init(void)
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
- alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64,
+ alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f5490222e134..960143b183cd 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -76,7 +76,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
}
/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
* @latch: value to convert
* @evt: pointer to clock event device descriptor
*
@@ -347,8 +347,7 @@ static void clockevents_notify_released(void)
while (!list_empty(&clockevents_released)) {
dev = list_entry(clockevents_released.next,
struct clock_event_device, list);
- list_del(&dev->list);
- list_add(&dev->list, &clockevent_devices);
+ list_move(&dev->list, &clockevent_devices);
tick_check_new_device(dev);
}
}
@@ -576,8 +575,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
if (old) {
module_put(old->owner);
clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
- list_del(&old->list);
- list_add(&old->list, &clockevents_released);
+ list_move(&old->list, &clockevents_released);
}
if (new) {
@@ -629,6 +627,7 @@ void tick_offline_cpu(unsigned int cpu)
/**
* tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+ * @cpu: The dead CPU
*/
void tick_cleanup_dead_cpu(int cpu)
{
@@ -668,9 +667,9 @@ static struct bus_type clockevents_subsys = {
static DEFINE_PER_CPU(struct device, tick_percpu_dev);
static struct tick_device *tick_get_tick_dev(struct device *dev);
-static ssize_t sysfs_show_current_tick_dev(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t current_device_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct tick_device *td;
ssize_t count = 0;
@@ -682,16 +681,16 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev,
raw_spin_unlock_irq(&clockevents_lock);
return count;
}
-static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
+static DEVICE_ATTR_RO(current_device);
/* We don't support the abomination of removable broadcast devices */
-static ssize_t sysfs_unbind_tick_dev(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t unbind_device_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
char name[CS_NAME_LEN];
ssize_t ret = sysfs_get_uname(buf, name, count);
- struct clock_event_device *ce;
+ struct clock_event_device *ce = NULL, *iter;
if (ret < 0)
return ret;
@@ -699,9 +698,10 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
ret = -ENODEV;
mutex_lock(&clockevents_mutex);
raw_spin_lock_irq(&clockevents_lock);
- list_for_each_entry(ce, &clockevent_devices, list) {
- if (!strcmp(ce->name, name)) {
- ret = __clockevents_try_unbind(ce, dev->id);
+ list_for_each_entry(iter, &clockevent_devices, list) {
+ if (!strcmp(iter->name, name)) {
+ ret = __clockevents_try_unbind(iter, dev->id);
+ ce = iter;
break;
}
}
@@ -714,7 +714,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
mutex_unlock(&clockevents_mutex);
return ret ? ret : count;
}
-static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
+static DEVICE_ATTR_WO(unbind_device);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
static struct device tick_bc_dev = {
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
new file mode 100644
index 000000000000..df922f49d171
--- /dev/null
+++ b/kernel/time/clocksource-wdtest.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Unit test for the clocksource watchdog.
+ *
+ * Copyright (C) 2021 Facebook, Inc.
+ *
+ * Author: Paul E. McKenney <paulmck@kernel.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
+
+#include "tick-internal.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
+module_param(holdoff, int, 0444);
+MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+
+/* Watchdog kthread's task_struct pointer for debug purposes. */
+static struct task_struct *wdtest_task;
+
+static u64 wdtest_jiffies_read(struct clocksource *cs)
+{
+ return (u64)jiffies;
+}
+
+static struct clocksource clocksource_wdtest_jiffies = {
+ .name = "wdtest-jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = TICK_NSEC,
+ .read = wdtest_jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .flags = CLOCK_SOURCE_MUST_VERIFY,
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
+};
+
+static int wdtest_ktime_read_ndelays;
+static bool wdtest_ktime_read_fuzz;
+
+static u64 wdtest_ktime_read(struct clocksource *cs)
+{
+ int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
+ static int sign = 1;
+ u64 ret;
+
+ if (wkrn) {
+ udelay(cs->uncertainty_margin / 250);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
+ }
+ ret = ktime_get_real_fast_ns();
+ if (READ_ONCE(wdtest_ktime_read_fuzz)) {
+ sign = -sign;
+ ret = ret + sign * 100 * NSEC_PER_MSEC;
+ }
+ return ret;
+}
+
+static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
+{
+ pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+}
+
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_VALID_FOR_HRES | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_VERIFY_PERCPU)
+
+static struct clocksource clocksource_wdtest_ktime = {
+ .name = "wdtest-ktime",
+ .rating = 300,
+ .read = wdtest_ktime_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = KTIME_FLAGS,
+ .mark_unstable = wdtest_ktime_cs_mark_unstable,
+ .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
+};
+
+/* Reset the clocksource if needed. */
+static void wdtest_ktime_clocksource_reset(void)
+{
+ if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
+ clocksource_unregister(&clocksource_wdtest_ktime);
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ schedule_timeout_uninterruptible(HZ / 10);
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ }
+}
+
+/* Run the specified series of watchdog tests. */
+static int wdtest_func(void *arg)
+{
+ unsigned long j1, j2;
+ char *s;
+ int i;
+
+ schedule_timeout_uninterruptible(holdoff * HZ);
+
+ /*
+ * Verify that jiffies-like clocksources get the manually
+ * specified uncertainty margin.
+ */
+ pr_info("--- Verify jiffies-like uncertainty margin.\n");
+ __clocksource_register(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+
+ j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ schedule_timeout_uninterruptible(HZ);
+ j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(j1 == j2);
+
+ clocksource_unregister(&clocksource_wdtest_jiffies);
+
+ /*
+ * Verify that tsc-like clocksources are assigned a reasonable
+ * uncertainty margin.
+ */
+ pr_info("--- Verify tsc-like uncertainty margin.\n");
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
+
+ j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ udelay(1);
+ j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
+ WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+
+ /* Verify tsc-like stability with various numbers of errors injected. */
+ for (i = 0; i <= max_cswd_read_retries + 1; i++) {
+ if (i <= 1 && i < max_cswd_read_retries)
+ s = "";
+ else if (i <= max_cswd_read_retries)
+ s = ", expect message";
+ else
+ s = ", expect clock skew";
+ pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, i);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
+ WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+ !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ wdtest_ktime_clocksource_reset();
+ }
+
+ /* Verify tsc-like stability with clock-value-fuzz error injection. */
+ pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
+ WRITE_ONCE(wdtest_ktime_read_fuzz, true);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ clocksource_verify_percpu(&clocksource_wdtest_ktime);
+ WRITE_ONCE(wdtest_ktime_read_fuzz, false);
+
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("--- Done with test.\n");
+ return 0;
+}
+
+static void wdtest_print_module_parms(void)
+{
+ pr_alert("--- holdoff=%d\n", holdoff);
+}
+
+/* Cleanup function. */
+static void clocksource_wdtest_cleanup(void)
+{
+}
+
+static int __init clocksource_wdtest_init(void)
+{
+ int ret = 0;
+
+ wdtest_print_module_parms();
+
+ /* Create watchdog-test task. */
+ wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
+ if (IS_ERR(wdtest_task)) {
+ ret = PTR_ERR(wdtest_task);
+ pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
+ wdtest_task = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+module_init(clocksource_wdtest_init);
+module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 02441ead3c3b..3052b1f1168e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -14,6 +14,8 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -38,7 +40,7 @@
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
* multiplied with the calculated mult factor. Larger ranges may
- * reduce the conversion accuracy by chosing smaller mult and shift
+ * reduce the conversion accuracy by choosing smaller mult and shift
* factors.
*/
void
@@ -93,6 +95,35 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
+/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
+
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ * Also a default for cs->uncertainty_margin when registering clocks.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
+ */
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
+#endif
+
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
+
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -104,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -118,12 +150,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
-/*
- * Interval: 0.5sec Threshold: 0.0625s
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -184,12 +210,203 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(struct timer_list *unused)
+ulong max_cswd_read_retries = 2;
+module_param(max_cswd_read_retries, ulong, 0644);
+EXPORT_SYMBOL_GPL(max_cswd_read_retries);
+static int verify_n_cpus = 8;
+module_param(verify_n_cpus, int, 0644);
+
+enum wd_read_status {
+ WD_READ_SUCCESS,
+ WD_READ_UNSTABLE,
+ WD_READ_SKIP
+};
+
+static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+{
+ unsigned int nretries;
+ u64 wd_end, wd_end2, wd_delta;
+ int64_t wd_delay, wd_seq_delay;
+
+ for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+ local_irq_disable();
+ *wdnow = watchdog->read(watchdog);
+ *csnow = cs->read(cs);
+ wd_end = watchdog->read(watchdog);
+ wd_end2 = watchdog->read(watchdog);
+ local_irq_enable();
+
+ wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
+ wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
+ watchdog->shift);
+ if (wd_delay <= WATCHDOG_MAX_SKEW) {
+ if (nretries > 1 || nretries >= max_cswd_read_retries) {
+ pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
+ smp_processor_id(), watchdog->name, nretries);
+ }
+ return WD_READ_SUCCESS;
+ }
+
+ /*
+ * Now compute delay in consecutive watchdog read to see if
+ * there is too much external interferences that cause
+ * significant delay in reading both clocksource and watchdog.
+ *
+ * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
+ * report system busy, reinit the watchdog and skip the current
+ * watchdog test.
+ */
+ wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
+ wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
+ goto skip_test;
+ }
+
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
+ return WD_READ_UNSTABLE;
+
+skip_test:
+ pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
+ smp_processor_id(), watchdog->name, wd_seq_delay);
+ pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
+ cs->name, wd_delay);
+ return WD_READ_SKIP;
+}
+
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+static cpumask_t cpus_chosen;
+
+static void clocksource_verify_choose_cpus(void)
+{
+ int cpu, i, n = verify_n_cpus;
+
+ if (n < 0) {
+ /* Check all of the CPUs. */
+ cpumask_copy(&cpus_chosen, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ return;
+ }
+
+ /* If no checking desired, or no other CPU to check, leave. */
+ cpumask_clear(&cpus_chosen);
+ if (n == 0 || num_online_cpus() <= 1)
+ return;
+
+ /* Make sure to select at least one CPU other than the current CPU. */
+ cpu = cpumask_first(cpu_online_mask);
+ if (cpu == smp_processor_id())
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+ cpumask_set_cpu(cpu, &cpus_chosen);
+
+ /* Force a sane value for the boot parameter. */
+ if (n > nr_cpu_ids)
+ n = nr_cpu_ids;
+
+ /*
+ * Randomly select the specified number of CPUs. If the same
+ * CPU is selected multiple times, that CPU is checked only once,
+ * and no replacement CPU is selected. This gracefully handles
+ * situations where verify_n_cpus is greater than the number of
+ * CPUs that are currently online.
+ */
+ for (i = 1; i < n; i++) {
+ cpu = get_random_u32_below(nr_cpu_ids);
+ cpu = cpumask_next(cpu - 1, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_online_mask);
+ if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ cpumask_set_cpu(cpu, &cpus_chosen);
+ }
+
+ /* Don't verify ourselves. */
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+}
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+ struct clocksource *cs = (struct clocksource *)csin;
+
+ csnow_mid = cs->read(cs);
+}
+
+void clocksource_verify_percpu(struct clocksource *cs)
+{
+ int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+ u64 csnow_begin, csnow_end;
+ int cpu, testcpu;
+ s64 delta;
+
+ if (verify_n_cpus == 0)
+ return;
+ cpumask_clear(&cpus_ahead);
+ cpumask_clear(&cpus_behind);
+ cpus_read_lock();
+ preempt_disable();
+ clocksource_verify_choose_cpus();
+ if (cpumask_empty(&cpus_chosen)) {
+ preempt_enable();
+ cpus_read_unlock();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+ return;
+ }
+ testcpu = smp_processor_id();
+ pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+ csnow_begin = cs->read(cs);
+ smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+ csnow_end = cs->read(cs);
+ delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_behind);
+ delta = (csnow_end - csnow_mid) & cs->mask;
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_ahead);
+ delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ if (cs_nsec > cs_nsec_max)
+ cs_nsec_max = cs_nsec;
+ if (cs_nsec < cs_nsec_min)
+ cs_nsec_min = cs_nsec;
+ }
+ preempt_enable();
+ cpus_read_unlock();
+ if (!cpumask_empty(&cpus_ahead))
+ pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_behind))
+ pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
+ pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+
+static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
+static void clocksource_watchdog(struct timer_list *unused)
+{
u64 csnow, wdnow, cslast, wdlast, delta;
- int64_t wd_nsec, cs_nsec;
+ int64_t wd_nsec, cs_nsec, interval;
int next_cpu, reset_pending;
+ struct clocksource *cs;
+ enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
+ u32 md;
spin_lock(&watchdog_lock);
if (!watchdog_running)
@@ -206,10 +423,31 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- local_irq_disable();
- csnow = cs->read(cs);
- wdnow = watchdog->read(watchdog);
- local_irq_enable();
+ read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
+
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
+ continue;
+ }
+
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
@@ -234,14 +472,50 @@ static void clocksource_watchdog(struct timer_list *unused)
if (atomic_read(&watchdog_reset_pending))
continue;
+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
- if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+ if (abs(cs_nsec - wd_nsec) > md) {
+ s64 cs_wd_msec;
+ s64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, csnow, cslast, cs->mask);
+ pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
+ if (curr_clocksource == cs)
+ pr_warn(" '%s' is current clocksource.\n", cs->name);
+ else if (curr_clocksource)
+ pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
+ else
+ pr_warn(" No current clocksource.\n");
__clocksource_unstable(cs);
continue;
}
@@ -299,7 +573,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -324,14 +598,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
@@ -407,6 +673,12 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
+ /* Do any required per-CPU skew verification. */
+ if (curr_clocksource &&
+ curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+ curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+ clocksource_verify_percpu(curr_clocksource);
+
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -518,7 +790,7 @@ static void clocksource_suspend_select(bool fallback)
* the suspend time when resuming system.
*
* This function is called late in the suspend process from timekeeping_suspend(),
- * that means processes are freezed, non-boot cpus and interrupts are disabled
+ * that means processes are frozen, non-boot cpus and interrupts are disabled
* now. It is therefore possible to start the suspend timer without taking the
* clocksource mutex.
*/
@@ -705,8 +977,6 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
&cs->max_cycles);
}
-#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
-
static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
struct clocksource *cs;
@@ -798,12 +1068,6 @@ static void clocksource_select_fallback(void)
__clocksource_select(true);
}
-#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-static inline void clocksource_select(void) { }
-static inline void clocksource_select_fallback(void) { }
-
-#endif
-
/*
* clocksource_done_booting - Called near the end of core bootup
*
@@ -884,6 +1148,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
}
+
+ /*
+ * If the uncertainty margin is not specified, calculate it.
+ * If both scale and freq are non-zero, calculate the clock
+ * period, but bound below at 2*WATCHDOG_MAX_SKEW. However,
+ * if either of scale or freq is zero, be very conservative and
+ * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
+ * uncertainty margin. Allow stupidly small uncertainty margins
+ * to be specified by the caller for testing purposes, but warn
+ * to discourage production use of this capability.
+ */
+ if (scale && freq && !cs->uncertainty_margin) {
+ cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+ if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+ cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+ } else if (!cs->uncertainty_margin) {
+ cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ }
+ WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
/*
* Ensure clocksources that have large 'mult' values don't overflow
* when adjusted.
@@ -928,6 +1212,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_arch_init(cs);
+ if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+ cs->id = CSID_GENERIC;
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
@@ -1217,7 +1503,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strlcpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str, sizeof(override_name));
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d89da1c7e005..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ .clock_base = { {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ }, },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -160,6 +164,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
unsigned long *flags)
+ __acquires(&timer->base->lock)
{
struct hrtimer_clock_base *base;
@@ -276,6 +281,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
@@ -338,7 +344,7 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-static struct debug_obj_descr hrtimer_debug_descr;
+static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
@@ -373,7 +379,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
- /* fall through */
+ fallthrough;
default:
return false;
}
@@ -397,7 +403,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr hrtimer_debug_descr = {
+static const struct debug_obj_descr hrtimer_debug_descr = {
.name = "hrtimer",
.debug_hint = hrtimer_debug_hint,
.fixup_init = hrtimer_fixup_init,
@@ -421,11 +427,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_free(struct hrtimer *timer)
-{
- debug_object_free(timer, &hrtimer_debug_descr);
-}
-
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode);
@@ -547,8 +548,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
- * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
- * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next
+ * but does not set cpu_base::*expires_next, that is done by
+ * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
+ * cpu_base::*expires_next right away, reprogramming logic would no longer
+ * work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@@ -589,6 +593,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
+static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ ktime_t expires_next, soft = KTIME_MAX;
+
+ /*
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
+ */
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ /*
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
+ */
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
+ }
+
+ return expires_next;
+}
+
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@@ -619,37 +654,10 @@ static inline int hrtimer_hres_active(void)
return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer *next_timer,
+ ktime_t expires_next)
{
- ktime_t expires_next;
-
- /*
- * Find the current next expiration time.
- */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
- /*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
- */
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
- }
-
- if (skip_equal && expires_next == cpu_base->expires_next)
- return;
-
cpu_base->expires_next = expires_next;
/*
@@ -666,13 +674,31 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* T1 is removed, so this code is called and would reprogram
* the hardware to 5s from now. Any hrtimer_start after that
* will not reprogram the hardware due to hang_detected being
- * set. So we'd effectivly block all timers until the T2 event
+ * set. So we'd effectively block all timers until the T2 event
* fires.
*/
if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
- tick_program_event(cpu_base->expires_next, 1);
+ tick_program_event(expires_next, 1);
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+{
+ ktime_t expires_next;
+
+ expires_next = hrtimer_update_next_event(cpu_base);
+
+ if (skip_equal && expires_next == cpu_base->expires_next)
+ return;
+
+ __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
}
/* High resolution timer related functions */
@@ -703,23 +729,7 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
- struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
-
- if (!__hrtimer_hres_active(base))
- return;
-
- raw_spin_lock(&base->lock);
- hrtimer_update_base(base);
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
-}
+static void retrigger_next_event(void *arg);
/*
* Switch to high resolution mode
@@ -741,29 +751,54 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
}
-static void clock_was_set_work(struct work_struct *work)
-{
- clock_was_set();
-}
+#else
-static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
+#endif /* CONFIG_HIGH_RES_TIMERS */
/*
- * Called from timekeeping and resume code to reprogram the hrtimer
- * interrupt device on all cpus.
+ * Retrigger next event is called after clock was set with interrupts
+ * disabled through an SMP function call or directly from low level
+ * resume code.
+ *
+ * This is only invoked when:
+ * - CONFIG_HIGH_RES_TIMERS is enabled.
+ * - CONFIG_NOHZ_COMMON is enabled
+ *
+ * For the other cases this function is empty and because the call sites
+ * are optimized out it vanishes as well, i.e. no need for lots of
+ * #ifdeffery.
*/
-void clock_was_set_delayed(void)
+static void retrigger_next_event(void *arg)
{
- schedule_work(&hrtimer_work);
-}
-
-#else
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline void hrtimer_switch_to_hres(void) { }
-static inline void retrigger_next_event(void *arg) { }
+ /*
+ * When high resolution mode or nohz is active, then the offsets of
+ * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
+ * next tick will take care of that.
+ *
+ * If high resolution mode is active then the next expiring timer
+ * must be reevaluated and the clock event device reprogrammed if
+ * necessary.
+ *
+ * In the NOHZ case the update of the offset and the reevaluation
+ * of the next expiring timer is enough. The return from the SMP
+ * function call will take care of the reprogramming in case the
+ * CPU was in a NOHZ idle sleep.
+ */
+ if (!__hrtimer_hres_active(base) && !tick_nohz_active)
+ return;
-#endif /* CONFIG_HIGH_RES_TIMERS */
+ raw_spin_lock(&base->lock);
+ hrtimer_update_base(base);
+ if (__hrtimer_hres_active(base))
+ hrtimer_force_reprogram(base, 0);
+ else
+ hrtimer_update_next_event(base);
+ raw_spin_unlock(&base->lock);
+}
/*
* When a timer is enqueued and expires earlier than the already enqueued
@@ -818,75 +853,161 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
if (base->cpu_base != cpu_base)
return;
+ if (expires >= cpu_base->expires_next)
+ return;
+
/*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
+ * If the hrtimer interrupt is running, then it will reevaluate the
+ * clock bases and reprogram the clock event device.
*/
if (cpu_base->in_hrtirq)
return;
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
cpu_base->next_timer = timer;
- cpu_base->expires_next = expires;
+
+ __hrtimer_reprogram(cpu_base, timer, expires);
+}
+
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
+ unsigned int active)
+{
+ struct hrtimer_clock_base *base;
+ unsigned int seq;
+ ktime_t expires;
/*
- * If hres is not active, hardware does not have to be
- * programmed yet.
+ * Update the base offsets unconditionally so the following
+ * checks whether the SMP function call is required works.
*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
+ * The update is safe even when the remote CPU is in the hrtimer
+ * interrupt or the hrtimer soft interrupt and expiring affected
+ * bases. Either it will see the update before handling a base or
+ * it will see it when it finishes the processing and reevaluates
+ * the next expiring timer.
*/
- if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
- return;
+ seq = cpu_base->clock_was_set_seq;
+ hrtimer_update_base(cpu_base);
/*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
+ * If the sequence did not change over the update then the
+ * remote CPU already handled it.
*/
- tick_program_event(expires, 1);
+ if (seq == cpu_base->clock_was_set_seq)
+ return false;
+
+ /*
+ * If the remote CPU is currently handling an hrtimer interrupt, it
+ * will reevaluate the first expiring timer of all clock bases
+ * before reprogramming. Nothing to do here.
+ */
+ if (cpu_base->in_hrtirq)
+ return false;
+
+ /*
+ * Walk the affected clock bases and check whether the first expiring
+ * timer in a clock base is moving ahead of the first expiring timer of
+ * @cpu_base. If so, the IPI must be invoked because per CPU clock
+ * event devices cannot be remotely reprogrammed.
+ */
+ active &= cpu_base->active_bases;
+
+ for_each_active_base(base, cpu_base, active) {
+ struct timerqueue_node *next;
+
+ next = timerqueue_getnext(&base->active);
+ expires = ktime_sub(next->expires, base->offset);
+ if (expires < cpu_base->expires_next)
+ return true;
+
+ /* Extra check for softirq clock bases */
+ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
+ continue;
+ if (cpu_base->softirq_activated)
+ continue;
+ if (expires < cpu_base->softirq_expires_next)
+ return true;
+ }
+ return false;
}
/*
- * Clock realtime was set
+ * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
+ * CLOCK_BOOTTIME (for late sleep time injection).
*
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
- *
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
+ * This requires to update the offsets for these clocks
+ * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
+ * also requires to eventually reprogram the per CPU clock event devices
+ * when the change moves an affected timer ahead of the first expiring
+ * timer on that CPU. Obviously remote per CPU clock event devices cannot
+ * be reprogrammed. The other reason why an IPI has to be sent is when the
+ * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
+ * in the tick, which obviously might be stopped, so this has to bring out
+ * the remote CPU which might sleep in idle to get this sorted.
*/
-void clock_was_set(void)
+void clock_was_set(unsigned int bases)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
-#endif
+ struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
+ cpumask_var_t mask;
+ int cpu;
+
+ if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ goto out_timerfd;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto out_timerfd;
+ }
+
+ /* Avoid interrupting CPUs if possible */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ unsigned long flags;
+
+ cpu_base = &per_cpu(hrtimer_bases, cpu);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (update_needs_ipi(cpu_base, bases))
+ cpumask_set_cpu(cpu, mask);
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ }
+
+ preempt_disable();
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+ preempt_enable();
+ cpus_read_unlock();
+ free_cpumask_var(mask);
+
+out_timerfd:
timerfd_clock_was_set();
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set(CLOCK_SET_WALL);
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping code to reprogram the hrtimer interrupt device
+ * on all cpus and to notify timerfd.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
/*
- * During resume we might have to reprogram the high resolution timer
- * interrupt on all online CPUs. However, all other CPUs will be
- * stopped with IRQs interrupts disabled so the clock_was_set() call
- * must be deferred.
+ * Called during resume either directly from via timekeeping_resume()
+ * or in the case of s2idle from tick_unfreeze() to ensure that the
+ * hrtimers are up to date.
*/
-void hrtimers_resume(void)
+void hrtimers_resume_local(void)
{
lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
- /* And schedule a retrigger for all others */
- clock_was_set_delayed();
}
/*
@@ -894,6 +1015,7 @@ void hrtimers_resume(void)
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
@@ -963,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
@@ -1002,7 +1125,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
* cpu_base->next_timer. This happens when we remove the first
* timer on a remote cpu. No harm as we never dereference
* cpu_base->next_timer. So the worst thing what can happen is
- * an superflous call to hrtimer_force_reprogram() on the
+ * an superfluous call to hrtimer_force_reprogram() on the
* remote cpu later on if the same timer gets enqueued again.
*/
if (reprogram && timer == cpu_base->next_timer)
@@ -1013,12 +1136,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool restart, bool keep_local)
{
u8 state = timer->state;
if (state & HRTIMER_STATE_ENQUEUED) {
- int reprogram;
+ bool reprogram;
/*
* Remove the timer and force reprogramming when high
@@ -1031,8 +1155,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ /*
+ * If the timer is not restarted then reprogramming is
+ * required if the timer is local. If it is local and about
+ * to be restarted, avoid programming it twice (on removal
+ * and a moment later when it's requeued).
+ */
if (!restart)
state = HRTIMER_STATE_INACTIVE;
+ else
+ reprogram &= !keep_local;
__remove_hrtimer(timer, base, state, reprogram);
return 1;
@@ -1086,9 +1218,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
struct hrtimer_clock_base *base)
{
struct hrtimer_clock_base *new_base;
+ bool force_local, first;
+
+ /*
+ * If the timer is on the local cpu base and is the first expiring
+ * timer then this might end up reprogramming the hardware twice
+ * (on removal and on enqueue). To avoid that by prevent the
+ * reprogram on removal, keep the timer local to the current CPU
+ * and enforce reprogramming after it is queued no matter whether
+ * it is the new first expiring timer again or not.
+ */
+ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local &= base->cpu_base->next_timer == timer;
- /* Remove an active timer from the queue: */
- remove_hrtimer(timer, base, true);
+ /*
+ * Remove an active timer from the queue. In case it is not queued
+ * on the current CPU, make sure that remove_hrtimer() updates the
+ * remote data correctly.
+ *
+ * If it's on the current CPU and the first expiring timer, then
+ * skip reprogramming, keep the timer local and enforce
+ * reprogramming later if it was the first expiring timer. This
+ * avoids programming the underlying clock event twice (once at
+ * removal and once after enqueue).
+ */
+ remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
@@ -1098,9 +1252,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
/* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+ if (!force_local) {
+ new_base = switch_hrtimer_base(timer, base,
+ mode & HRTIMER_MODE_PINNED);
+ } else {
+ new_base = base;
+ }
- return enqueue_hrtimer(timer, new_base, mode);
+ first = enqueue_hrtimer(timer, new_base, mode);
+ if (!force_local)
+ return first;
+
+ /*
+ * Timer was forced to stay on the current CPU to avoid
+ * reprogramming on removal and enqueue. Force reprogram the
+ * hardware by evaluating the new first expiring timer.
+ */
+ hrtimer_force_reprogram(new_base->cpu_base, 1);
+ return 0;
}
/**
@@ -1166,7 +1335,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false);
+ ret = remove_hrtimer(timer, base, false, false);
unlock_hrtimer_base(timer, &flags);
@@ -1195,7 +1364,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* The counterpart to hrtimer_cancel_wait_running().
*
* If there is a waiter for cpu_base->expiry_lock, then it was waiting for
- * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
@@ -1285,7 +1454,7 @@ int hrtimer_cancel(struct hrtimer *timer)
EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
- * hrtimer_get_remaining - get remaining time for the timer
+ * __hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
* @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
@@ -1381,7 +1550,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
int base;
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context for latency reasons and because the callbacks
* can invoke functions which might sleep on RT, e.g. spin_lock().
@@ -1413,7 +1582,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: The modes which are relevant for intitialization:
+ * @mode: The modes which are relevant for initialization:
* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
* HRTIMER_MODE_REL_SOFT
*
@@ -1470,7 +1639,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* insufficient for that.
*
* The sequence numbers are required because otherwise we could still observe
- * a false negative if the read side got smeared over multiple consequtive
+ * a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
@@ -1571,7 +1740,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
* minimizing wakeups, not running timers at the
* earliest interrupt after their soft expiration.
* This allows us to avoid using a Priority Search
- * Tree, which can answer a stabbing querry for
+ * Tree, which can answer a stabbing query for
* overlapping intervals and instead use the simple
* BST we already have.
* We don't add extra wakeups by delaying timers that
@@ -1645,8 +1814,8 @@ retry:
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
+ /* Reevaluate the clock bases for the [soft] next expiry */
+ expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
@@ -1805,7 +1974,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode)
{
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context either for latency reasons or because the
* hrtimer callback takes regular spinlocks or invokes other
@@ -1818,7 +1987,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* the same CPU. That causes a latency spike due to the wakeup of
* a gazillion threads.
*
- * OTOH, priviledged real-time user space applications rely on the
+ * OTOH, privileged real-time user space applications rely on the
* low latency of hard interrupt wakeups. If the current task is in
* a real-time scheduling class, mark the mode for hard interrupt
* expiry.
@@ -1872,11 +2041,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
struct restart_block *restart;
do {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
- freezable_schedule();
+ schedule();
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
@@ -1924,7 +2093,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
u64 slack;
slack = current->timer_slack_ns;
- if (dl_task(current) || rt_task(current))
+ if (rt_task(current))
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
@@ -1940,9 +2109,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
}
restart = &current->restart_block;
- restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -1961,6 +2130,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -1982,6 +2152,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -1998,8 +2169,11 @@ int hrtimers_prepare_cpu(unsigned int cpu)
int i;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- cpu_base->clock_base[i].cpu_base = cpu_base;
- timerqueue_init_head(&cpu_base->clock_base[i].active);
+ struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+ clock_b->cpu_base = cpu_base;
+ seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+ timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
@@ -2010,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2046,29 +2221,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, ncpu = cpumask_first(cpu_active_mask);
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
+ tick_cancel_sched_timer(dying_cpu);
+
+ old_base = this_cpu_ptr(&hrtimer_bases);
+ new_base = &per_cpu(hrtimer_bases, ncpu);
- /*
- * this BH disable ensures that raise_softirq_irqoff() does
- * not wakeup ksoftirqd (and acquire the pi-lock) while
- * holding the cpu_base lock
- */
- local_bh_disable();
- local_irq_disable();
- old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&old_base->lock);
+ raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2079,15 +2247,14 @@ int hrtimers_dead_cpu(unsigned int scpu)
* The migration might have changed the first expiring softirq
* timer on this CPU. Update it.
*/
- hrtimer_update_softirq_timer(new_base, false);
+ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+ /* Tell the other CPU to retrigger the next event */
+ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
- raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
+ raw_spin_unlock(&old_base->lock);
- /* Check, if we got expired work to do */
- __hrtimer_peek_ahead_timers();
- local_irq_enable();
- local_bh_enable();
return 0;
}
@@ -2102,7 +2269,7 @@ void __init hrtimers_init(void)
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
* @clock_id: timer clock to be used
*/
@@ -2129,6 +2296,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
+ /*
+ * Override any slack passed by the user if under
+ * rt contraints.
+ */
+ if (rt_task(current))
+ delta = 0;
+
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
@@ -2143,11 +2317,12 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return !t.task ? 0 : -EINTR;
}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
@@ -2155,7 +2330,8 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* the current task state has been set (see set_current_state()).
*
* The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
+ * actual wakeup to a time that is both power and performance friendly
+ * for regular (non RT/DL) tasks.
* The kernel give the normal best effort behavior for "@expires+@delta",
* but may decide to fire the timer earlier, but no earlier than @expires.
*
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index ca4e6d57d68b..00629e658ca1 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -172,10 +172,6 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
u64 oval, nval, ointerval, ninterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
- /*
- * Use the to_ktime conversion because that clamps the maximum
- * value to KTIME_MAX and avoid multiplication overflows.
- */
nval = timespec64_to_ns(&value->it_value);
ninterval = timespec64_to_ns(&value->it_interval);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index eddcf4970444..bc4db9e5ab70 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -10,28 +10,9 @@
#include <linux/init.h>
#include "timekeeping.h"
+#include "tick-internal.h"
-/* Since jiffies uses a simple TICK_NSEC multiplier
- * conversion, the .shift value could be zero. However
- * this would make NTP adjustments impossible as they are
- * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
- * shift both the nominator and denominator the same
- * amount, and give ntp adjustments in units of 1/2^8
- *
- * The value 8 is somewhat carefully chosen, as anything
- * larger can result in overflows. TICK_NSEC grows as HZ
- * shrinks, so values greater than 8 overflow 32bits when
- * HZ=100.
- */
-#if HZ < 34
-#define JIFFIES_SHIFT 6
-#elif HZ < 67
-#define JIFFIES_SHIFT 7
-#else
-#define JIFFIES_SHIFT 8
-#endif
-
static u64 jiffies_read(struct clocksource *cs)
{
return (u64) jiffies;
@@ -44,22 +25,24 @@ static u64 jiffies_read(struct clocksource *cs)
* the timer interrupt frequency HZ and it suffers
* inaccuracies caused by missed or lost timer
* interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
+ * interrupt hardware to accurately tick at the
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
static struct clocksource clocksource_jiffies = {
- .name = "jiffies",
- .rating = 1, /* lowest valid rating*/
- .read = jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
+ .read = jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
-__cacheline_aligned_in_smp seqcount_t jiffies_seq;
+__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq =
+ SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock);
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5d9fc22d836a..0775b9ec952a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- kref_init(&ns->kref);
+ refcount_set(&ns->ns.count, 1);
- ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
@@ -192,6 +192,24 @@ static void timens_setup_vdso_data(struct vdso_data *vdata,
offset[CLOCK_BOOTTIME_ALARM] = boottime;
}
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
/*
* Protects possibly multiple offsets writers racing each other
* and tasks entering the namespace.
@@ -226,11 +244,8 @@ out:
mutex_unlock(&offset_lock);
}
-void free_time_ns(struct kref *kref)
+void free_time_ns(struct time_namespace *ns)
{
- struct time_namespace *ns;
-
- ns = container_of(kref, struct time_namespace, kref);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
@@ -280,11 +295,16 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
- int err;
if (!current_is_single_threaded())
return -EUSERS;
@@ -293,12 +313,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- timens_set_vvar_page(current, ns);
-
- err = vdso_join_timens(current, ns);
- if (err)
- return err;
-
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
@@ -309,27 +323,20 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
return 0;
}
-int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
- int err;
/* create_new_namespaces() already incremented the ref counter */
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
- return 0;
-
- timens_set_vvar_page(tsk, ns);
-
- err = vdso_join_timens(tsk, ns);
- if (err)
- return err;
+ return;
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
- return 0;
+ timens_commit(tsk, ns);
}
static struct user_namespace *timens_owner(struct ns_common *ns)
@@ -470,15 +477,9 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .kref = KREF_INIT(3),
+ .ns.count = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
.ns.inum = PROC_TIME_INIT_INO,
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
-
-static int __init time_ns_init(void)
-{
- return 0;
-}
-subsys_initcall(time_ns_init);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 069ca78fb0bf..406dccb79c2b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -494,65 +494,74 @@ out:
return leap;
}
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_hw_clock(struct work_struct *work);
-static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock);
-
-static void sched_sync_hw_clock(struct timespec64 now,
- unsigned long target_nsec, bool fail)
+static DECLARE_WORK(sync_work, sync_hw_clock);
+static struct hrtimer sync_hrtimer;
+#define SYNC_PERIOD_NS (11ULL * 60 * NSEC_PER_SEC)
+static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer)
{
- struct timespec64 next;
-
- ktime_get_real_ts64(&next);
- if (!fail)
- next.tv_sec = 659;
- else {
- /*
- * Try again as soon as possible. Delaying long periods
- * decreases the accuracy of the work queue timer. Due to this
- * the algorithm is very likely to require a short-sleep retry
- * after the above long sleep to synchronize ts_nsec.
- */
- next.tv_sec = 0;
- }
-
- /* Compute the needed delay that will get to tv_nsec == target_nsec */
- next.tv_nsec = target_nsec - next.tv_nsec;
- if (next.tv_nsec <= 0)
- next.tv_nsec += NSEC_PER_SEC;
- if (next.tv_nsec >= NSEC_PER_SEC) {
- next.tv_sec++;
- next.tv_nsec -= NSEC_PER_SEC;
- }
+ queue_work(system_freezable_power_efficient_wq, &sync_work);
- queue_delayed_work(system_power_efficient_wq, &sync_work,
- timespec64_to_jiffies(&next));
+ return HRTIMER_NORESTART;
}
-static void sync_rtc_clock(void)
+static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry)
{
- unsigned long target_nsec;
- struct timespec64 adjust, now;
- int rc;
+ ktime_t exp = ktime_set(ktime_get_real_seconds(), 0);
- if (!IS_ENABLED(CONFIG_RTC_SYSTOHC))
- return;
+ if (retry)
+ exp = ktime_add_ns(exp, 2ULL * NSEC_PER_SEC - offset_nsec);
+ else
+ exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec);
- ktime_get_real_ts64(&now);
+ hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS);
+}
- adjust = now;
- if (persistent_clock_is_local)
- adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+/*
+ * Check whether @now is correct versus the required time to update the RTC
+ * and calculate the value which needs to be written to the RTC so that the
+ * next seconds increment of the RTC after the write is aligned with the next
+ * seconds increment of clock REALTIME.
+ *
+ * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds
+ *
+ * t2.tv_nsec == 0
+ * tsched = t2 - set_offset_nsec
+ * newval = t2 - NSEC_PER_SEC
+ *
+ * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC
+ *
+ * As the execution of this code is not guaranteed to happen exactly at
+ * tsched this allows it to happen within a fuzzy region:
+ *
+ * abs(now - tsched) < FUZZ
+ *
+ * If @now is not inside the allowed window the function returns false.
+ */
+static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec,
+ struct timespec64 *to_set,
+ const struct timespec64 *now)
+{
+ /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */
+ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
+ struct timespec64 delay = {.tv_sec = -1,
+ .tv_nsec = set_offset_nsec};
- /*
- * The current RTC in use will provide the target_nsec it wants to be
- * called at, and does rtc_tv_nsec_ok internally.
- */
- rc = rtc_set_ntp_time(adjust, &target_nsec);
- if (rc == -ENODEV)
- return;
+ *to_set = timespec64_add(*now, delay);
+
+ if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) {
+ to_set->tv_nsec = 0;
+ return true;
+ }
- sched_sync_hw_clock(now, target_nsec, rc);
+ if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) {
+ to_set->tv_sec++;
+ to_set->tv_nsec = 0;
+ return true;
+ }
+ return false;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -560,48 +569,47 @@ int __weak update_persistent_clock64(struct timespec64 now64)
{
return -ENODEV;
}
+#else
+static inline int update_persistent_clock64(struct timespec64 now64)
+{
+ return -ENODEV;
+}
#endif
-static bool sync_cmos_clock(void)
+#ifdef CONFIG_RTC_SYSTOHC
+/* Save NTP synchronized time to the RTC */
+static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec)
{
- static bool no_cmos;
- struct timespec64 now;
- struct timespec64 adjust;
- int rc = -EPROTO;
- long target_nsec = NSEC_PER_SEC / 2;
+ struct rtc_device *rtc;
+ struct rtc_time tm;
+ int err = -ENODEV;
- if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE))
- return false;
+ rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE);
+ if (!rtc)
+ return -ENODEV;
- if (no_cmos)
- return false;
+ if (!rtc->ops || !rtc->ops->set_time)
+ goto out_close;
- /*
- * Historically update_persistent_clock64() has followed x86
- * semantics, which match the MC146818A/etc RTC. This RTC will store
- * 'adjust' and then in .5s it will advance once second.
- *
- * Architectures are strongly encouraged to use rtclib and not
- * implement this legacy API.
- */
- ktime_get_real_ts64(&now);
- if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) {
- if (persistent_clock_is_local)
- adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
- rc = update_persistent_clock64(adjust);
- /*
- * The machine does not support update_persistent_clock64 even
- * though it defines CONFIG_GENERIC_CMOS_UPDATE.
- */
- if (rc == -ENODEV) {
- no_cmos = true;
- return false;
- }
+ /* First call might not have the correct offset */
+ if (*offset_nsec == rtc->set_offset_nsec) {
+ rtc_time64_to_tm(to_set->tv_sec, &tm);
+ err = rtc_set_time(rtc, &tm);
+ } else {
+ /* Store the update offset and let the caller try again */
+ *offset_nsec = rtc->set_offset_nsec;
+ err = -EAGAIN;
}
-
- sched_sync_hw_clock(now, target_nsec, rc);
- return true;
+out_close:
+ rtc_class_close(rtc);
+ return err;
+}
+#else
+static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec)
+{
+ return -ENODEV;
}
+#endif
/*
* If we have an externally synchronized Linux clock, then update RTC clock
@@ -613,24 +621,64 @@ static bool sync_cmos_clock(void)
*/
static void sync_hw_clock(struct work_struct *work)
{
- if (!ntp_synced())
- return;
+ /*
+ * The default synchronization offset is 500ms for the deprecated
+ * update_persistent_clock64() under the assumption that it uses
+ * the infamous CMOS clock (MC146818).
+ */
+ static unsigned long offset_nsec = NSEC_PER_SEC / 2;
+ struct timespec64 now, to_set;
+ int res = -EAGAIN;
- if (sync_cmos_clock())
+ /*
+ * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer()
+ * managed to schedule the work between the timer firing and the
+ * work being able to rearm the timer. Wait for the timer to expire.
+ */
+ if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer))
return;
- sync_rtc_clock();
+ ktime_get_real_ts64(&now);
+ /* If @now is not in the allowed window, try again */
+ if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now))
+ goto rearm;
+
+ /* Take timezone adjusted RTCs into account */
+ if (persistent_clock_is_local)
+ to_set.tv_sec -= (sys_tz.tz_minuteswest * 60);
+
+ /* Try the legacy RTC first. */
+ res = update_persistent_clock64(to_set);
+ if (res != -ENODEV)
+ goto rearm;
+
+ /* Try the RTC class */
+ res = update_rtc(&to_set, &offset_nsec);
+ if (res == -ENODEV)
+ return;
+rearm:
+ sched_sync_hw_clock(offset_nsec, res != 0);
}
void ntp_notify_cmos_timer(void)
{
- if (!ntp_synced())
- return;
+ /*
+ * When the work is currently executed but has not yet the timer
+ * rearmed this queues the work immediately again. No big issue,
+ * just a pointless work scheduled.
+ */
+ if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer))
+ queue_work(system_freezable_power_efficient_wq, &sync_work);
+}
- if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) ||
- IS_ENABLED(CONFIG_RTC_SYSTOHC))
- queue_delayed_work(system_power_efficient_wq, &sync_work, 0);
+static void __init ntp_init_cmos_sync(void)
+{
+ hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ sync_hrtimer.function = sync_timer_callback;
}
+#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
+static inline void __init ntp_init_cmos_sync(void) { }
+#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
/*
* Propagate a new txc->status value into the NTP state:
@@ -1044,4 +1092,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
+ ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 908ecaa65fc3..23d1b74c3065 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc,
const struct timespec64 *ts,
s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
+
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
+extern void ntp_notify_cmos_timer(void);
+#else
+static inline void ntp_notify_cmos_timer(void) { }
+#endif
+
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
*/
static struct posix_clock *get_posix_clock(struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk = pccontext->clk;
down_read(&clk->rwsem);
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
static ssize_t posix_clock_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -EINVAL;
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return -ENODEV;
if (clk->ops.read)
- err = clk->ops.read(clk, fp->f_flags, buf, count);
+ err = clk->ops.read(pccontext, fp->f_flags, buf, count);
put_posix_clock(clk);
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
__poll_t result = 0;
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
return EPOLLERR;
if (clk->ops.poll)
- result = clk->ops.poll(clk, fp, wait);
+ result = clk->ops.poll(pccontext, fp, wait);
put_posix_clock(clk);
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
static long posix_clock_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
static long posix_clock_compat_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
int err;
struct posix_clock *clk =
container_of(inode->i_cdev, struct posix_clock, cdev);
+ struct posix_clock_context *pccontext;
down_read(&clk->rwsem);
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = -ENODEV;
goto out;
}
+ pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+ if (!pccontext) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pccontext->clk = clk;
+ fp->private_data = pccontext;
if (clk->ops.open)
- err = clk->ops.open(clk, fp->f_mode);
+ err = clk->ops.open(pccontext, fp->f_mode);
else
err = 0;
if (!err) {
get_device(clk->dev);
- fp->private_data = clk;
}
out:
up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
static int posix_clock_release(struct inode *inode, struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk;
int err = 0;
+ if (!pccontext)
+ return -ENODEV;
+ clk = pccontext->clk;
+
if (clk->ops.release)
- err = clk->ops.release(clk);
+ err = clk->ops.release(pccontext);
put_device(clk->dev);
+ kfree(pccontext);
fp->private_data = NULL;
return err;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,7 @@
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
+#include <linux/task_work.h>
#include "posix-timers.h"
@@ -34,14 +35,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
* necessary. Needs siglock protection since other code may update the
* expiration cache as well.
+ *
+ * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
+ * we cannot lock_task_sighand. Cannot fail if task is current.
*/
-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
u64 nsecs = rlim_new * NSEC_PER_SEC;
+ unsigned long irq_fl;
- spin_lock_irq(&task->sighand->siglock);
+ if (!lock_task_sighand(task, &irq_fl))
+ return -ESRCH;
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
- spin_unlock_irq(&task->sighand->siglock);
+ unlock_task_sighand(task, &irq_fl);
+ return 0;
}
/*
@@ -236,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- u64 curr_cputime;
-retry:
- curr_cputime = atomic64_read(cputime);
- if (sum_cputime > curr_cputime) {
- if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
- goto retry;
- }
+ u64 curr_cputime = atomic64_read(cputime);
+
+ do {
+ if (sum_cputime <= curr_cputime)
+ return;
+ } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
@@ -279,7 +285,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
* @tsk: Task for which cputime needs to be started
* @samples: Storage for time samples
*
- * The thread group cputime accouting is avoided when there are no posix
+ * The thread group cputime accounting is avoided when there are no posix
* CPU timers armed. Before starting a timer it's required to check whether
* the time accounting is active. If not, a full update of the atomic
* accounting store needs to be done and the accounting enabled.
@@ -291,6 +297,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
+ lockdep_assert_task_sighand_held(tsk);
+
/* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(pct->timers_active)) {
struct task_cputime sum;
@@ -377,6 +385,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
+ static struct lock_class_key posix_cpu_timers_key;
struct pid *pid;
rcu_read_lock();
@@ -386,6 +395,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return -EINVAL;
}
+ /*
+ * If posix timer expiry is handled in task work context then
+ * timer::it_lock can be taken without disabling interrupts as all
+ * other locking happens in task context. This requires a separate
+ * lock class key otherwise regular posix timer expiry would record
+ * the lock class being taken in interrupt context and generate a
+ * false positive warning.
+ */
+ if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+ lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.pid = get_pid(pid);
@@ -393,6 +413,55 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return 0;
}
+static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
+ struct task_struct *tsk)
+{
+ int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ return tsk->posix_cputimers.bases + clkidx;
+ else
+ return tsk->signal->posix_cputimers.bases + clkidx;
+}
+
+/*
+ * Force recalculating the base earliest expiration on the next tick.
+ * This will also re-evaluate the need to keep around the process wide
+ * cputime counter and tick dependency and eventually shut these down
+ * if necessary.
+ */
+static void trigger_base_recalc_expires(struct k_itimer *timer,
+ struct task_struct *tsk)
+{
+ struct posix_cputimer_base *base = timer_base(timer, tsk);
+
+ base->nextevt = 0;
+}
+
+/*
+ * Dequeue the timer and reset the base if it was its earliest expiration.
+ * It makes sure the next tick recalculates the base next expiration so we
+ * don't keep the costly process wide cputime counter around for a random
+ * amount of time, along with the tick dependency.
+ *
+ * If another timer gets queued between this and the next tick, its
+ * expiration will update the base next event if necessary on the next
+ * tick.
+ */
+static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
+{
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct posix_cputimer_base *base;
+
+ if (!cpu_timer_dequeue(ctmr))
+ return;
+
+ base = timer_base(timer, p);
+ if (cpu_timer_getexpires(ctmr) == base->nextevt)
+ trigger_base_recalc_expires(timer, p);
+}
+
+
/*
* Clean up a CPU-clock timer that is about to be destroyed.
* This is called from timer deletion with the timer already locked.
@@ -427,7 +496,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
if (timer->it.cpu.firing)
ret = TIMER_RETRY;
else
- cpu_timer_dequeue(ctmr);
+ disarm_timer(timer, p);
unlock_task_sighand(p, &flags);
}
@@ -486,15 +555,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
*/
static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
- int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+ struct posix_cputimer_base *base = timer_base(timer, p);
struct cpu_timer *ctmr = &timer->it.cpu;
u64 newexp = cpu_timer_getexpires(ctmr);
- struct posix_cputimer_base *base;
-
- if (CPUCLOCK_PERTHREAD(timer->it_clock))
- base = p->posix_cputimers.bases + clkidx;
- else
- base = p->signal->posix_cputimers.bases + clkidx;
if (!cpu_timer_enqueue(&base->tqhead, ctmr))
return;
@@ -511,7 +574,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
- tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+ tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
}
/*
@@ -691,16 +754,29 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
timer->it_overrun_last = 0;
timer->it_overrun = -1;
- if (new_expires != 0 && !(val < new_expires)) {
+ if (val >= new_expires) {
+ if (new_expires != 0) {
+ /*
+ * The designated time already passed, so we notify
+ * immediately, even if the thread never runs to
+ * accumulate more time on this clock.
+ */
+ cpu_timer_fire(timer);
+ }
+
/*
- * The designated time already passed, so we notify
- * immediately, even if the thread never runs to
- * accumulate more time on this clock.
+ * Make sure we don't keep around the process wide cputime
+ * counter or the tick dependency if they are not necessary.
*/
- cpu_timer_fire(timer);
- }
+ sighand = lock_task_sighand(p, &flags);
+ if (!sighand)
+ goto out;
+
+ if (!cpu_timer_queued(ctmr))
+ trigger_base_recalc_expires(timer, p);
- ret = 0;
+ unlock_task_sighand(p, &flags);
+ }
out:
rcu_read_unlock();
if (old)
@@ -770,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -793,7 +871,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
{
if (tsk->dl.dl_overrun) {
tsk->dl.dl_overrun = 0;
- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
}
}
@@ -807,7 +885,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
rt ? "RT" : "CPU", hard ? "hard" : "soft",
current->comm, task_pid_nr(current));
}
- __group_send_sig_info(signo, SEND_SIG_PRIV, current);
+ send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
return true;
}
@@ -881,7 +959,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
trace_itimer_expire(signo == SIGPROF ?
ITIMER_PROF : ITIMER_VIRTUAL,
task_tgid(tsk), cur_time);
- __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+ send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
}
if (it->expires && it->expires < *expires)
@@ -979,6 +1057,11 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
if (!p)
goto out;
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL))
+ goto out;
+
/*
* Fetch the current sample and update the timer's expiry time.
*/
@@ -989,11 +1072,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
bump_cpu_timer(timer, now);
- /* Protect timer list r/w in arm_timer() */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL))
- goto out;
-
/*
* Now re-arm for the new expiry time.
*/
@@ -1080,43 +1158,233 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
return false;
}
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
+ handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
/*
- * This is called from the timer interrupt handler. The irq handler has
- * already updated our counts. We need to check if any timers fire now.
- * Interrupts are disabled.
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
*/
-void run_posix_cpu_timers(void)
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
- struct task_struct *tsk = current;
- struct k_itimer *timer, *next;
- unsigned long flags;
- LIST_HEAD(firing);
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
- lockdep_assert_irqs_disabled();
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
+}
+
+/*
+ * Clear existing posix CPU timers task work.
+ */
+void clear_posix_cputimers_work(struct task_struct *p)
+{
/*
- * The fast path checks that there are no expired thread or thread
- * group timers. If that's so, just return.
+ * A copied work entry from the old task is not meaningful, clear it.
+ * N.B. init_task_work will not do this.
*/
- if (!fastpath_timer_check(tsk))
- return;
+ memset(&p->posix_cputimers_work.work, 0,
+ sizeof(p->posix_cputimers_work.work));
+ init_task_work(&p->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
+ p->posix_cputimers_work.scheduled = false;
+}
- lockdep_posixtimer_enter();
- if (!lock_task_sighand(tsk, &flags)) {
- lockdep_posixtimer_exit();
+/*
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
+ */
+void __init posix_cputimers_init_work(void)
+{
+ clear_posix_cputimers_work(current);
+}
+
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside of that the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return tsk->posix_cputimers_work.scheduled;
+}
+
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
return;
+
+ /* Schedule task work to actually expire the timers */
+ tsk->posix_cputimers_work.scheduled = true;
+ task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ bool ret = true;
+
+ /*
+ * On !RT kernels interrupts are disabled while collecting expired
+ * timers, so no tick can happen and the fast path check can be
+ * reenabled without further checks.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ tsk->posix_cputimers_work.scheduled = false;
+ return true;
}
+
/*
- * Here we take off tsk->signal->cpu_timers[N] and
- * tsk->cpu_timers[N] all the timers that are firing, and
- * put them on the firing list.
+ * On RT enabled kernels ticks can happen while the expired timers
+ * are collected under sighand lock. But any tick which observes
+ * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
+ * checks. So reenabling the tick work has do be done carefully:
+ *
+ * Disable interrupts and run the fast path check if jiffies have
+ * advanced since the collecting of expired timers started. If
+ * jiffies have not advanced or the fast path check did not find
+ * newly expired timers, reenable the fast path check in the timer
+ * interrupt. If there are newly expired timers, return false and
+ * let the collection loop repeat.
*/
- check_thread_timers(tsk, &firing);
+ local_irq_disable();
+ if (start != jiffies && fastpath_timer_check(tsk))
+ ret = false;
+ else
+ tsk->posix_cputimers_work.scheduled = false;
+ local_irq_enable();
+
+ return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ lockdep_posixtimer_enter();
+ handle_posix_cpu_timers(tsk);
+ lockdep_posixtimer_exit();
+}
+
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
+{
+ struct k_itimer *timer, *next;
+ unsigned long flags, start;
+ LIST_HEAD(firing);
+
+ if (!lock_task_sighand(tsk, &flags))
+ return;
+
+ do {
+ /*
+ * On RT locking sighand lock does not disable interrupts,
+ * so this needs to be careful vs. ticks. Store the current
+ * jiffies value.
+ */
+ start = READ_ONCE(jiffies);
+ barrier();
+
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
- check_process_timers(tsk, &firing);
+ /*
+ * The above timer checks have updated the expiry cache and
+ * because nothing can have queued or modified timers after
+ * sighand lock was taken above it is guaranteed to be
+ * consistent. So the next timer interrupt fastpath check
+ * will find valid data.
+ *
+ * If timer expiry runs in the timer interrupt context then
+ * the loop is not relevant as timers will be directly
+ * expired in interrupt context. The stub function below
+ * returns always true which allows the compiler to
+ * optimize the loop out.
+ *
+ * If timer expiry is deferred to task work context then
+ * the following rules apply:
+ *
+ * - On !RT kernels no tick can have happened on this CPU
+ * after sighand lock was acquired because interrupts are
+ * disabled. So reenabling task work before dropping
+ * sighand lock and reenabling interrupts is race free.
+ *
+ * - On RT kernels ticks might have happened but the tick
+ * work ignored posix CPU timer handling because the
+ * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+ * must be done very carefully including a check whether
+ * ticks have happened since the start of the timer
+ * expiry checks. posix_cpu_timers_enable_work() takes
+ * care of that and eventually lets the expiry checks
+ * run again.
+ */
+ } while (!posix_cpu_timers_enable_work(tsk, start));
/*
- * We must release these locks before taking any timer's lock.
+ * We must release sighand lock before taking any timer's lock.
* There is a potential race with timer deletion here, as the
* siglock now protects our private firing list. We have set
* the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1402,13 @@ void run_posix_cpu_timers(void)
list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
int cpu_firing;
+ /*
+ * spin_lock() is sufficient here even independent of the
+ * expiry context. If expiry happens in hard interrupt
+ * context it's obvious. For task work context it's safe
+ * because all other operations on timer::it_lock happen in
+ * task context (syscall or exit).
+ */
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
@@ -1145,9 +1420,38 @@ void run_posix_cpu_timers(void)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
- lockdep_posixtimer_exit();
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+ struct task_struct *tsk = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * If the actual expiry is deferred to task work context and the
+ * work is already scheduled there is no point to do anything here.
+ */
+ if (posix_cpu_timers_work_scheduled(tsk))
+ return;
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ __run_posix_cpu_timers(tsk);
}
/*
@@ -1180,9 +1484,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
}
}
- if (!*newval)
- return;
- *newval += now;
+ if (*newval)
+ *newval += now;
}
/*
@@ -1192,7 +1495,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
if (*newval < *nextevt)
*nextevt = *newval;
- tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+ tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
@@ -1253,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1314,8 +1610,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
+ set_restart_fn(restart_block, posix_cpu_nsleep_restart);
}
return error;
}
@@ -1379,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index fcb3b21d8bdc..9b6fcb8d85e7 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,40 +17,6 @@
#include <linux/time_namespace.h>
#include <linux/compat.h>
-#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
-/* Architectures may override SYS_NI and COMPAT_SYS_NI */
-#include <asm/syscall_wrapper.h>
-#endif
-
-asmlinkage long sys_ni_posix_timers(void)
-{
- pr_err_once("process %d (%s) attempted a POSIX timer syscall "
- "while CONFIG_POSIX_TIMERS is not set\n",
- current->pid, current->comm);
- return -ENOSYS;
-}
-
-#ifndef SYS_NI
-#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
-#endif
-
-#ifndef COMPAT_SYS_NI
-#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
-#endif
-
-SYS_NI(timer_create);
-SYS_NI(timer_gettime);
-SYS_NI(timer_getoverrun);
-SYS_NI(timer_settime);
-SYS_NI(timer_delete);
-SYS_NI(clock_adjtime);
-SYS_NI(getitimer);
-SYS_NI(setitimer);
-SYS_NI(clock_adjtime32);
-#ifdef __ARCH_WANT_SYS_ALARM
-SYS_NI(alarm);
-#endif
-
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
* as it is easy to remain compatible with little code. CLOCK_BOOTTIME
@@ -70,7 +36,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
return do_sys_settimeofday64(&new_tp, NULL);
}
-int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
+static int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
{
switch (which_clock) {
case CLOCK_REALTIME:
@@ -90,6 +56,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
+
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
@@ -146,6 +113,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -156,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
which_clock);
}
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_NI(timer_create);
-#endif
-
-#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
-#endif
-
#ifdef CONFIG_COMPAT_32BIT_TIME
-SYS_NI(timer_settime32);
-SYS_NI(timer_gettime32);
SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
struct old_timespec32 __user *, tp)
@@ -239,6 +196,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 07709ac30439..b924f0f096fa 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -35,20 +35,17 @@
#include "timekeeping.h"
#include "posix-timers.h"
-/*
- * Management arrays for POSIX timers. Timers are now kept in static hash table
- * with 512 entries.
- * Timer ids are allocated by local routine, which selects proper hash head by
- * key, constructed from current->signal address and per signal struct counter.
- * This keeps timer ids unique per process, but now they can intersect between
- * processes.
- */
+static struct kmem_cache *posix_timers_cache;
/*
- * Lets keep our timers in a slab cache :-)
+ * Timers are managed in a hash table for lockless lookup. The hash key is
+ * constructed from current::signal and the timer ID and the timer is
+ * matched against current::signal and the timer ID when walking the hash
+ * bucket list.
+ *
+ * This allows checkpoint/restore to reconstruct the exact timer IDs for
+ * a process.
*/
-static struct kmem_cache *posix_timers_cache;
-
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
static DEFINE_SPINLOCK(hash_lock);
@@ -56,52 +53,12 @@ static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
-/*
- * we assume that the new SIGEV_THREAD_ID shares no bits with the other
- * SIGEV values. Here we put out an error if this assumption fails.
- */
+/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
- ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-/*
- * The timer ID is turned into a timer address by idr_find().
- * Verifying a valid ID consists of:
- *
- * a) checking that idr_find() returns other than -1.
- * b) checking that the timer id matches the one in the timer itself.
- * c) that the timer owner is in the callers thread group.
- */
-
-/*
- * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
- * to implement others. This structure defines the various
- * clocks.
- *
- * RESOLUTION: Clock resolution is used to round up timer and interval
- * times, NOT to report clock times, which are reported with as
- * much resolution as the system can muster. In some cases this
- * resolution may depend on the underlying clock hardware and
- * may not be quantifiable until run time, and only then is the
- * necessary code is written. The standard says we should say
- * something about this issue in the documentation...
- *
- * FUNCTIONS: The CLOCKs structure defines possible functions to
- * handle various clock functions.
- *
- * The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for
- * the timer. 2.) The list, it_lock, it_clock, it_id and
- * it_pid fields are not modified by timer code.
- *
- * Permissions: It is assumed that the clock_settime() function defined
- * for each clock will take care of permission checks. Some
- * clocks may be set able by any user (i.e. local process
- * clocks) others not. Currently the only set able clock we
- * have is CLOCK_REALTIME and its high res counter part, both of
- * which we beg off on and pass to do_sys_settimeofday().
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
#define lock_timer(tid, flags) \
@@ -121,9 +78,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
{
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash,
- lockdep_is_held(&hash_lock)) {
- if ((timer->it_signal == sig) && (timer->it_id == id))
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ /* timer->it_signal can be set concurrently */
+ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
}
return NULL;
@@ -140,25 +97,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
@@ -166,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
spin_unlock_irqrestore(&timr->it_lock, flags);
}
-/* Get clock_realtime */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
@@ -178,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
return ktime_get_real();
}
-/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
{
@@ -191,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
return do_adjtimex(t);
}
-/*
- * Get monotonic time for posix timers
- */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
@@ -206,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
return ktime_get();
}
-/*
- * Get monotonic-raw time for posix timers
- */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
@@ -216,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_coarse_real_ts64(tp);
@@ -267,14 +220,11 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-/*
- * Initialize everything, well, just everything in Posix clocks/timers ;)
- */
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof (struct k_itimer), 0, SLAB_PANIC,
- NULL);
+ sizeof(struct k_itimer), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -300,15 +250,9 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
}
/*
- * This function is exported for use by the signal deliver code. It is
- * called just prior to the info block being released and passes that
- * block to us. It's function is to update the overrun entry AND to
- * restart the timer. It should only be called if the timer is to be
- * restarted (i.e. we have flagged this in the sys_private entry of the
- * info block).
- *
- * To protect against the timer going away while the interrupt is queued,
- * we require that the it_requeue_pending flag be set.
+ * This function is called from the signal delivery code if
+ * info->si_sys_private is not zero, which indicates that the timer has to
+ * be rearmed. Restart the timer and update info::si_overrun.
*/
void posixtimer_rearm(struct kernel_siginfo *info)
{
@@ -336,7 +280,7 @@ void posixtimer_rearm(struct kernel_siginfo *info)
int posix_timer_event(struct k_itimer *timr, int si_private)
{
enum pid_type type;
- int ret = -1;
+ int ret;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->posixtimer_rearm().
@@ -357,18 +301,18 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
}
/*
- * This function gets called when a POSIX.1b interval timer expires. It
- * is used as a callback from the kernel internal timer. The
- * run_timer_list code ALWAYS calls with interrupts on.
-
- * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ * This function gets called when a POSIX.1b interval timer expires from
+ * the HRTIMER interrupt (soft interrupt on RT kernels).
+ *
+ * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
+ * based timers.
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
@@ -379,9 +323,10 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
if (posix_timer_event(timr, si_private)) {
/*
- * signal was not sent because of sig_ignor
- * we will not get a call back to restart it AND
- * it should be restarted.
+ * The signal was not queued due to SIG_IGN. As a
+ * consequence the timer is not going to be rearmed from
+ * the signal delivery path. But as a real signal handler
+ * can be installed later the timer must be rearmed here.
*/
if (timr->it_interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
@@ -390,34 +335,35 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* FIXME: What we really want, is to stop this
* timer completely and restart it in case the
* SIG_IGN is removed. This is a non trivial
- * change which involves sighand locking
- * (sigh !), which we don't want to do late in
- * the release cycle.
+ * change to the signal handling code.
+ *
+ * For now let timers with an interval less than a
+ * jiffie expire every jiffie and recheck for a
+ * valid signal handler.
*
- * For now we just let timers with an interval
- * less than a jiffie expire every jiffie to
- * avoid softirq starvation in case of SIG_IGN
- * and a very small interval, which would put
- * the timer right back on the softirq pending
- * list. By moving now ahead of time we trick
- * hrtimer_forward() to expire the timer
- * later, while we still maintain the overrun
- * accuracy, but have some inconsistency in
- * the timer_gettime() case. This is at least
- * better than a starved softirq. A more
- * complex fix which solves also another related
- * inconsistency is already in the pipeline.
+ * This avoids interrupt starvation in case of a
+ * very small interval, which would expire the
+ * timer immediately again.
+ *
+ * Moving now ahead of time by one jiffie tricks
+ * hrtimer_forward() to expire the timer later,
+ * while it still maintains the overrun accuracy
+ * for the price of a slight inconsistency in the
+ * timer_gettime() case. This is at least better
+ * than a timer storm.
+ *
+ * Only required when high resolution timers are
+ * enabled as the periodic tick based timers are
+ * automatically aligned to the next tick.
*/
-#ifdef CONFIG_HIGH_RES_TIMERS
- {
- ktime_t kj = NSEC_PER_SEC / HZ;
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
+ ktime_t kj = TICK_NSEC;
if (timr->it_interval < kj)
now = ktime_add(now, kj);
}
-#endif
- timr->it_overrun += hrtimer_forward(timer, now,
- timr->it_interval);
+
+ timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
timr->it_active = 1;
@@ -439,12 +385,12 @@ static struct pid *good_sigevent(sigevent_t * event)
rtn = pid_task(pid, PIDTYPE_PID);
if (!rtn || !same_thread_group(rtn, current))
return NULL;
- /* FALLTHRU */
+ fallthrough;
case SIGEV_SIGNAL:
case SIGEV_THREAD:
if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
return NULL;
- /* FALLTHRU */
+ fallthrough;
case SIGEV_NONE:
return pid;
default:
@@ -454,8 +400,8 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer * alloc_posix_timer(void)
{
- struct k_itimer *tmr;
- tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+
if (!tmr)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
@@ -473,21 +419,21 @@ static void k_itimer_rcu_free(struct rcu_head *head)
kmem_cache_free(posix_timers_cache, tmr);
}
-#define IT_ID_SET 1
-#define IT_ID_NOT_SET 0
-static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+static void posix_timer_free(struct k_itimer *tmr)
{
- if (it_id_set) {
- unsigned long flags;
- spin_lock_irqsave(&hash_lock, flags);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock_irqrestore(&hash_lock, flags);
- }
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
+static void posix_timer_unhash_and_free(struct k_itimer *tmr)
+{
+ spin_lock(&hash_lock);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock(&hash_lock);
+ posix_timer_free(tmr);
+}
+
static int common_timer_create(struct k_itimer *new_timer)
{
hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -501,7 +447,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
- int it_id_set = IT_ID_NOT_SET;
if (!kc)
return -EINVAL;
@@ -513,13 +458,18 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
+
+ /*
+ * Add the timer to the hash table. The timer is not yet valid
+ * because new_timer::it_signal is still NULL. The timer id is also
+ * not yet visible to user space.
+ */
new_timer_id = posix_timer_add(new_timer);
if (new_timer_id < 0) {
- error = new_timer_id;
- goto out;
+ posix_timer_free(new_timer);
+ return new_timer_id;
}
- it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
@@ -547,30 +497,33 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
+ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
-
+ /*
+ * After succesful copy out, the timer ID is visible to user space
+ * now but not yet valid because new_timer::signal is still NULL.
+ *
+ * Complete the initialization with the clock specific create
+ * callback.
+ */
error = kc->timer_create(new_timer);
if (error)
goto out;
spin_lock_irq(&current->sighand->siglock);
- new_timer->it_signal = current->signal;
+ /* This makes the timer valid in the hash table */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
list_add(&new_timer->list, &current->signal->posix_timers);
spin_unlock_irq(&current->sighand->siglock);
-
- return 0;
/*
- * In the case of the timer belonging to another task, after
- * the task is unlocked, the timer is owned by the other task
- * and may cease to exist at any time. Don't use or modify
- * new_timer after the unlock call.
+ * After unlocking sighand::siglock @new_timer is subject to
+ * concurrent removal and cannot be touched anymore
*/
+ return 0;
out:
- release_posix_timer(new_timer, it_id_set);
+ posix_timer_unhash_and_free(new_timer);
return error;
}
@@ -604,13 +557,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-/*
- * Locking issues: We need to protect the result of the id look up until
- * we get the timer locked down so it is not deleted under us. The
- * removal is done under the idr spinlock so we use that here to bridge
- * the find to the timer lock. To avoid a dead lock, the timer id MUST
- * be release with out holding the timer lock.
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
@@ -622,10 +568,35 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
if ((unsigned long long)timer_id > INT_MAX)
return NULL;
+ /*
+ * The hash lookup and the timers are RCU protected.
+ *
+ * Timers are added to the hash in invalid state where
+ * timr::it_signal == NULL. timer::it_signal is only set after the
+ * rest of the initialization succeeded.
+ *
+ * Timer destruction happens in steps:
+ * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 2) Release timr::it_lock
+ * 3) Remove from the hash under hash_lock
+ * 4) Call RCU for removal after the grace period
+ *
+ * Holding rcu_read_lock() accross the lookup ensures that
+ * the timer cannot be freed.
+ *
+ * The lookup validates locklessly that timr::it_signal ==
+ * current::it_signal and timr::it_id == @timer_id. timr::it_id
+ * can't change, but timr::it_signal becomes NULL during
+ * destruction.
+ */
rcu_read_lock();
timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
+ /*
+ * Validate under timr::it_lock that timr::it_signal is
+ * still valid. Pairs with #1 above.
+ */
if (timr->it_signal == current->signal) {
rcu_read_unlock();
return timr;
@@ -652,20 +623,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
}
/*
- * Get the time remaining on a POSIX.1b interval timer. This function
- * is ALWAYS called with spin_lock_irq on the timer, thus it must not
- * mess with irq.
+ * Get the time remaining on a POSIX.1b interval timer.
+ *
+ * Two issues to handle here:
*
- * We have a couple of messes to clean up here. First there is the case
- * of a timer that has a requeue pending. These timers should appear to
- * be in the timer list with an expiry as if we were to requeue them
- * now.
+ * 1) The timer has a requeue pending. The return value must appear as
+ * if the timer has been requeued right now.
*
- * The second issue is the SIGEV_NONE timer which may be active but is
- * not really ever put in the timer list (to save system resources).
- * This timer may be expired, and if so, we will do it here. Otherwise
- * it is the same as a requeue pending timer WRT to what we should
- * report.
+ * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued
+ * into the hrtimer queue and therefore never expired. Emulate expiry
+ * here taking #1 into account.
*/
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
@@ -681,8 +648,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
cur_setting->it_interval = ktime_to_timespec64(iv);
} else if (!timr->it_active) {
/*
- * SIGEV_NONE oneshot timers are never queued. Check them
- * below.
+ * SIGEV_NONE oneshot timers are never queued and therefore
+ * timr->it_active is always false. The check below
+ * vs. remaining time will handle this case.
+ *
+ * For all other timers there is nothing to update here, so
+ * return.
*/
if (!sig_none)
return;
@@ -691,18 +662,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
now = kc->clock_get_ktime(timr->it_clock);
/*
- * When a requeue is pending or this is a SIGEV_NONE timer move the
- * expiry time forward by intervals, so expiry is > now.
+ * If this is an interval timer and either has requeue pending or
+ * is a SIGEV_NONE timer move the expiry time forward by intervals,
+ * so expiry is > now.
*/
if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
timr->it_overrun += kc->timer_forward(timr, now);
remaining = kc->timer_remaining(timr, now);
- /* Return 0 only, when the timer is expired and not pending */
+ /*
+ * As @now is retrieved before a possible timer_forward() and
+ * cannot be reevaluated by the compiler @remaining is based on the
+ * same @now value. Therefore @remaining is consistent vs. @now.
+ *
+ * Consequently all interval timers, i.e. @iv > 0, cannot have a
+ * remaining time <= 0 because timer_forward() guarantees to move
+ * them forward so that the next timer expiry is > @now.
+ */
if (remaining <= 0) {
/*
- * A single shot SIGEV_NONE timer must return 0, when
- * it is expired !
+ * A single shot SIGEV_NONE timer must return 0, when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
if (!sig_none)
cur_setting->it_value.tv_nsec = 1;
@@ -711,11 +693,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
}
}
-/* Get the time remaining on a POSIX.1b interval timer. */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- struct k_itimer *timr;
const struct k_clock *kc;
+ struct k_itimer *timr;
unsigned long flags;
int ret = 0;
@@ -765,20 +746,29 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
#endif
-/*
- * Get the number of overruns of a POSIX.1b interval timer. This is to
- * be the overrun of the timer last delivered. At the same time we are
- * accumulating overruns on the next timer. The overrun is frozen when
- * the signal is delivered, either at the notify time (if the info block
- * is not queued) or at the actual delivery time (as we are informed by
- * the call back to posixtimer_rearm(). So all we need to do is
- * to pick up the frozen overrun.
+/**
+ * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
+ * @timer_id: The timer ID which identifies the timer
+ *
+ * The "overrun count" of a timer is one plus the number of expiration
+ * intervals which have elapsed between the first expiry, which queues the
+ * signal and the actual signal delivery. On signal delivery the "overrun
+ * count" is calculated and cached, so it can be returned directly here.
+ *
+ * As this is relative to the last queued signal the returned overrun count
+ * is meaningless outside of the signal delivery path and even there it
+ * does not accurately reflect the current state when user space evaluates
+ * it.
+ *
+ * Returns:
+ * -EINVAL @timer_id is invalid
+ * 1..INT_MAX The number of overruns related to the last delivered signal
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
struct k_itimer *timr;
- int overrun;
unsigned long flags;
+ int overrun;
timr = lock_timer(timer_id, &flags);
if (!timr)
@@ -831,10 +821,18 @@ static void common_timer_wait_running(struct k_itimer *timer)
}
/*
- * On PREEMPT_RT this prevent priority inversion against softirq kthread in
- * case it gets preempted while executing a timer callback. See comments in
- * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
- * cpu_relax().
+ * On PREEMPT_RT this prevents priority inversion and a potential livelock
+ * against the ksoftirqd thread in case that ksoftirqd gets preempted while
+ * executing a hrtimer callback.
+ *
+ * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
+ * just results in a cpu_relax().
+ *
+ * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
+ * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
+ * prevents spinning on an eventually scheduled out task and a livelock
+ * when the task which tries to delete or disarm the timer has preempted
+ * the task which runs the expiry in task work context.
*/
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
unsigned long *flags)
@@ -846,6 +844,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
@@ -939,8 +941,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct __kernel_itimerspec __user *, new_setting,
struct __kernel_itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec, old_spec;
- struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
+ struct itimerspec64 new_spec, old_spec, *rtn;
int error = 0;
if (!new_setting)
@@ -949,6 +950,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
+ rtn = old_setting ? &old_spec : NULL;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
if (put_itimerspec64(&old_spec, old_setting))
@@ -1022,44 +1024,88 @@ retry_delete:
list_del(&timer->list);
spin_unlock(&current->sighand->siglock);
/*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
+ * A concurrent lookup could check timer::it_signal lockless. It
+ * will reevaluate with timer::it_lock held and observe the NULL.
*/
- timer->it_signal = NULL;
+ WRITE_ONCE(timer->it_signal, NULL);
unlock_timer(timer, flags);
- release_posix_timer(timer, IT_ID_SET);
+ posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * return timer owned by the process, used by exit_itimers
+ * Delete a timer if it is armed, remove it from the hash and schedule it
+ * for RCU freeing.
*/
static void itimer_delete(struct k_itimer *timer)
{
-retry_delete:
- spin_lock_irq(&timer->it_lock);
+ unsigned long flags;
+
+ /*
+ * irqsave is required to make timer_wait_running() work.
+ */
+ spin_lock_irqsave(&timer->it_lock, flags);
+retry_delete:
+ /*
+ * Even if the timer is not longer accessible from other tasks
+ * it still might be armed and queued in the underlying timer
+ * mechanism. Worse, that timer mechanism might run the expiry
+ * function concurrently.
+ */
if (timer_delete_hook(timer) == TIMER_RETRY) {
- spin_unlock_irq(&timer->it_lock);
+ /*
+ * Timer is expired concurrently, prevent livelocks
+ * and pointless spinning on RT.
+ *
+ * timer_wait_running() drops timer::it_lock, which opens
+ * the possibility for another task to delete the timer.
+ *
+ * That's not possible here because this is invoked from
+ * do_exit() only for the last thread of the thread group.
+ * So no other task can access and delete that timer.
+ */
+ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
+ return;
+
goto retry_delete;
}
list_del(&timer->list);
- spin_unlock_irq(&timer->it_lock);
- release_posix_timer(timer, IT_ID_SET);
+ /*
+ * Setting timer::it_signal to NULL is technically not required
+ * here as nothing can access the timer anymore legitimately via
+ * the hash table. Set it to NULL nevertheless so that all deletion
+ * paths are consistent.
+ */
+ WRITE_ONCE(timer->it_signal, NULL);
+
+ spin_unlock_irqrestore(&timer->it_lock, flags);
+ posix_timer_unhash_and_free(timer);
}
/*
- * This is called by do_exit or de_thread, only when there are no more
- * references to the shared signal_struct.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
*/
-void exit_itimers(struct signal_struct *sig)
+void exit_itimers(struct task_struct *tsk)
{
+ struct list_head timers;
struct k_itimer *tmr;
- while (!list_empty(&sig->posix_timers)) {
- tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+ if (list_empty(&tsk->signal->posix_timers))
+ return;
+
+ /* Protect against concurrent read via /proc/$PID/timers */
+ spin_lock_irq(&tsk->sighand->siglock);
+ list_replace_init(&tsk->signal->posix_timers, &timers);
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ /* The timers are not longer accessible via tsk::signal */
+ while (!list_empty(&timers)) {
+ tmr = list_first_entry(&timers, struct k_itimer, list);
itimer_delete(tmr);
}
}
@@ -1076,6 +1122,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (get_timespec64(&new_tp, tp))
return -EFAULT;
+ /*
+ * Permission checks have to be done inside the clock specific
+ * setter callback.
+ */
return kc->clock_set(which_clock, &new_tp);
}
@@ -1126,6 +1176,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
return err;
}
+/**
+ * sys_clock_getres - Get the resolution of a clock
+ * @which_clock: The clock to get the resolution for
+ * @tp: Pointer to a a user space timespec64 for storage
+ *
+ * POSIX defines:
+ *
+ * "The clock_getres() function shall return the resolution of any
+ * clock. Clock resolutions are implementation-defined and cannot be set by
+ * a process. If the argument res is not NULL, the resolution of the
+ * specified clock shall be stored in the location pointed to by res. If
+ * res is NULL, the clock resolution is not returned. If the time argument
+ * of clock_settime() is not a multiple of res, then the value is truncated
+ * to a multiple of res."
+ *
+ * Due to the various hardware constraints the real resolution can vary
+ * wildly and even change during runtime when the underlying devices are
+ * replaced. The kernel also can use hardware devices with different
+ * resolutions for reading the time and for arming timers.
+ *
+ * The kernel therefore deviates from the POSIX spec in various aspects:
+ *
+ * 1) The resolution returned to user space
+ *
+ * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
+ * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW
+ * the kernel differentiates only two cases:
+ *
+ * I) Low resolution mode:
+ *
+ * When high resolution timers are disabled at compile or runtime
+ * the resolution returned is nanoseconds per tick, which represents
+ * the precision at which timers expire.
+ *
+ * II) High resolution mode:
+ *
+ * When high resolution timers are enabled the resolution returned
+ * is always one nanosecond independent of the actual resolution of
+ * the underlying hardware devices.
+ *
+ * For CLOCK_*_ALARM the actual resolution depends on system
+ * state. When system is running the resolution is the same as the
+ * resolution of the other clocks. During suspend the actual
+ * resolution is the resolution of the underlying RTC device which
+ * might be way less precise than the clockevent device used during
+ * running state.
+ *
+ * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
+ * returned is always nanoseconds per tick.
+ *
+ * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution
+ * returned is always one nanosecond under the assumption that the
+ * underlying scheduler clock has a better resolution than nanoseconds
+ * per tick.
+ *
+ * For dynamic POSIX clocks (PTP devices) the resolution returned is
+ * always one nanosecond.
+ *
+ * 2) Affect on sys_clock_settime()
+ *
+ * The kernel does not truncate the time which is handed in to
+ * sys_clock_settime(). The kernel internal timekeeping is always using
+ * nanoseconds precision independent of the clocksource device which is
+ * used to read the time from. The resolution of that device only
+ * affects the presicion of the time returned by sys_clock_gettime().
+ *
+ * Returns:
+ * 0 Success. @tp contains the resolution
+ * -EINVAL @which_clock is not a valid clock ID
+ * -EFAULT Copying the resolution to @tp faulted
+ * -ENODEV Dynamic POSIX clock is not backed by a device
+ * -EOPNOTSUPP Dynamic POSIX clock does not support getres()
+ */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
@@ -1191,8 +1314,8 @@ SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
err = do_clock_adjtime(which_clock, &ktx);
- if (err >= 0)
- err = put_old_timex32(utp, &ktx);
+ if (err >= 0 && put_old_timex32(utp, &ktx))
+ return -EFAULT;
return err;
}
@@ -1217,7 +1340,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
#endif
/*
- * nanosleep for monotonic and realtime clocks
+ * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
*/
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
@@ -1229,8 +1352,13 @@ static int common_nsleep(const clockid_t which_clock, int flags,
which_clock);
}
+/*
+ * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ *
+ * Absolute nanosleeps for these clocks are time-namespace adjusted.
+ */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
- const struct timespec64 *rqtp)
+ const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
@@ -1261,6 +1389,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1288,6 +1417,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..68d6c1190ac7 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,6 +8,7 @@
#include <linux/jiffies.h>
#include <linux/ktime.h>
#include <linux/kernel.h>
+#include <linux/math.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
@@ -20,31 +21,6 @@
#include "timekeeping.h"
/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns: sched_clock() value at last update
- * @epoch_cyc: Clock cycle value at last update.
- * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
- * clocks.
- * @read_sched_clock: Current clock source (or dummy source when suspended).
- * @mult: Multipler for scaled math conversion.
- * @shift: Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 sched_clock_mask;
- u64 (*read_sched_clock)(void);
- u32 mult;
- u32 shift;
-};
-
-/**
* struct clock_data - all data needed for sched_clock() (including
* registration of a new clock source)
*
@@ -60,7 +36,7 @@ struct clock_read_data {
* into a single 64-byte cache line.
*/
struct clock_data {
- seqcount_t seq;
+ seqcount_latch_t seq;
struct clock_read_data read_data[2];
ktime_t wrap_kt;
unsigned long rate;
@@ -88,29 +64,49 @@ static struct clock_data cd ____cacheline_aligned = {
.actual_read_sched_clock = jiffy_sched_clock_read,
};
-static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
-unsigned long long notrace sched_clock(void)
+notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+ *seq = raw_read_seqcount_latch(&cd.seq);
+ return cd.read_data + (*seq & 1);
+}
+
+notrace int sched_clock_read_retry(unsigned int seq)
+{
+ return raw_read_seqcount_latch_retry(&cd.seq, seq);
+}
+
+unsigned long long noinstr sched_clock_noinstr(void)
{
- u64 cyc, res;
- unsigned int seq;
struct clock_read_data *rd;
+ unsigned int seq;
+ u64 cyc, res;
do {
- seq = raw_read_seqcount(&cd.seq);
+ seq = raw_read_seqcount_latch(&cd.seq);
rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
- } while (read_seqcount_retry(&cd.seq, seq));
+ } while (raw_read_seqcount_latch_retry(&cd.seq, seq));
return res;
}
+unsigned long long notrace sched_clock(void)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return ns;
+}
+
/*
* Updating the data required to read the clock.
*
@@ -214,15 +210,13 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
r = rate;
if (r >= 4000000) {
- r /= 1000000;
+ r = DIV_ROUND_CLOSEST(r, 1000000);
r_unit = 'M';
+ } else if (r >= 4000) {
+ r = DIV_ROUND_CLOSEST(r, 1000);
+ r_unit = 'k';
} else {
- if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else {
- r_unit = ' ';
- }
+ r_unit = ' ';
}
/* Calculate the ns resolution of this counter */
@@ -244,7 +238,7 @@ void __init generic_sched_clock_init(void)
{
/*
* If no sched_clock() function has been provided at that point,
- * make it the final one one.
+ * make it the final one.
*/
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
@@ -273,7 +267,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned int seq = raw_read_seqcount(&cd.seq);
+ unsigned int seq = raw_read_seqcount_latch(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 77c63005dc4e..20d5df631570 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -21,7 +21,6 @@
#define DEBUGFS_FILENAME "udelay_test"
static DEFINE_MUTEX(udelay_test_lock);
-static struct dentry *udelay_test_debugfs_file;
static int udelay_test_usecs;
static int udelay_test_iterations = DEFAULT_ITERATIONS;
@@ -138,8 +137,8 @@ static const struct file_operations udelay_test_debugfs_ops = {
static int __init udelay_test_init(void)
{
mutex_lock(&udelay_test_lock);
- udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
- S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ debugfs_create_file(DEBUGFS_FILENAME, S_IRUSR, NULL, NULL,
+ &udelay_test_debugfs_ops);
mutex_unlock(&udelay_test_lock);
return 0;
@@ -150,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(udelay_test_debugfs_file);
+ debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index b5a65e212df2..e28f9210f8a1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -53,28 +53,23 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* reasons.
*
* Each caller tries to arm the hrtimer on its own CPU, but if the
- * hrtimer callbback function is currently running, then
+ * hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
+ */
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
*
- * As this can be called from idle code, the hrtimer_start()
- * invocation has to be wrapped with RCU_NONIDLE() as
- * hrtimer_start() can call into tracing.
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
*/
- RCU_NONIDLE( {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
- /*
- * The core tick broadcast mode expects bc->bound_on to be set
- * correctly to prevent a CPU which has the broadcast hrtimer
- * armed from going deep idle.
- *
- * As tick_broadcast_lock is held, nothing can change the cpu
- * base which was just established in hrtimer_start() above. So
- * the below access is safe even without holding the hrtimer
- * base lock.
- */
- bc->bound_on = bctimer.base->cpu_base->cpu;
- } );
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e51778c312f1..771d1e040303 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,14 +33,17 @@ static int tick_broadcast_forced;
static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
#ifdef CONFIG_TICK_ONESHOT
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);
+
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# ifdef CONFIG_HOTPLUG_CPU
static void tick_broadcast_oneshot_offline(unsigned int cpu);
# endif
#else
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
+static inline void
+tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
# ifdef CONFIG_HOTPLUG_CPU
@@ -61,6 +64,13 @@ struct cpumask *tick_get_broadcast_mask(void)
return tick_broadcast_mask;
}
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu);
+
+const struct clock_event_device *tick_get_wakeup_device(int cpu)
+{
+ return tick_get_oneshot_wakeup_device(cpu);
+}
+
/*
* Start the device in periodic mode
*/
@@ -88,13 +98,75 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
return !curdev || newdev->rating > curdev->rating;
}
+#ifdef CONFIG_TICK_ONESHOT
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
+{
+ return per_cpu(tick_oneshot_wakeup_device, cpu);
+}
+
+static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
+{
+ /*
+ * If we woke up early and the tick was reprogrammed in the
+ * meantime then this may be spurious but harmless.
+ */
+ tick_receive_broadcast();
+}
+
+static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
+ int cpu)
+{
+ struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu);
+
+ if (!newdev)
+ goto set_device;
+
+ if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return false;
+
+ if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+
+ if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
+ return false;
+
+ if (curdev && newdev->rating <= curdev->rating)
+ return false;
+
+ if (!try_module_get(newdev->owner))
+ return false;
+
+ newdev->event_handler = tick_oneshot_wakeup_handler;
+set_device:
+ clockevents_exchange_device(curdev, newdev);
+ per_cpu(tick_oneshot_wakeup_device, cpu) = newdev;
+ return true;
+}
+#else
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
+{
+ return NULL;
+}
+
+static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
+ int cpu)
+{
+ return false;
+}
+#endif
+
/*
* Conditionally install/replace broadcast device
*/
-void tick_install_broadcast_device(struct clock_event_device *dev)
+void tick_install_broadcast_device(struct clock_event_device *dev, int cpu)
{
struct clock_event_device *cur = tick_broadcast_device.evtdev;
+ if (tick_set_oneshot_wakeup_device(dev, cpu))
+ return;
+
if (!tick_check_broadcast_device(cur, dev))
return;
@@ -107,6 +179,19 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(dev);
+
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return;
+
+ /*
+ * If the system already runs in oneshot mode, switch the newly
+ * registered broadcast device to oneshot mode explicitly.
+ */
+ if (tick_broadcast_oneshot_active()) {
+ tick_broadcast_switch_to_oneshot();
+ return;
+ }
+
/*
* Inform all cpus about this. We might be in a situation
* where we did not switch to oneshot mode because the per cpu
@@ -115,8 +200,7 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
* notification the systems stays stuck in periodic mode
* forever.
*/
- if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
- tick_clock_notify();
+ tick_clock_notify();
}
/*
@@ -157,7 +241,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
}
/*
- * Check, if the device is disfunctional and a place holder, which
+ * Check, if the device is dysfunctional and a placeholder, which
* needs to be handled by the broadcast device.
*/
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
@@ -181,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
ret = 1;
} else {
/*
@@ -241,7 +325,6 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
return ret;
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
int tick_receive_broadcast(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
@@ -256,7 +339,6 @@ int tick_receive_broadcast(void)
evt->event_handler(evt);
return 0;
}
-#endif
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
@@ -331,7 +413,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
bc_local = tick_do_periodic_broadcast();
if (clockevent_state_oneshot(dev)) {
- ktime_t next = ktime_add(dev->next_event, tick_period);
+ ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC);
clockevents_program_event(dev, next, true);
}
@@ -381,7 +463,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
- /* fall through */
+ fallthrough;
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
@@ -391,7 +473,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
* - the broadcast device exists
* - the broadcast device is not a hrtimer based one
* - the broadcast device is in periodic mode to
- * avoid a hickup during switch to oneshot mode
+ * avoid a hiccup during switch to oneshot mode
*/
if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
@@ -419,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
}
}
out:
@@ -541,9 +623,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
* to avoid a deep idle transition as we are about to get the
* broadcast IPI right away.
*/
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
{
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+ return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
}
/*
@@ -707,24 +793,16 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state,
+ struct tick_device *td,
+ int cpu)
{
- struct clock_event_device *bc, *dev;
- int cpu, ret = 0;
+ struct clock_event_device *bc, *dev = td->evtdev;
+ int ret = 0;
ktime_t now;
- /*
- * If there is no broadcast device, tell the caller not to go
- * into deep idle.
- */
- if (!tick_broadcast_device.evtdev)
- return -EBUSY;
-
- dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
-
raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
- cpu = smp_processor_id();
if (state == TICK_BROADCAST_ENTER) {
/*
@@ -853,6 +931,53 @@ out:
return ret;
}
+static int tick_oneshot_wakeup_control(enum tick_broadcast_state state,
+ struct tick_device *td,
+ int cpu)
+{
+ struct clock_event_device *dev, *wd;
+
+ dev = td->evtdev;
+ if (td->mode != TICKDEV_MODE_ONESHOT)
+ return -EINVAL;
+
+ wd = tick_get_oneshot_wakeup_device(cpu);
+ if (!wd)
+ return -ENODEV;
+
+ switch (state) {
+ case TICK_BROADCAST_ENTER:
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_program_event(wd, dev->next_event, 1);
+ break;
+ case TICK_BROADCAST_EXIT:
+ /* We may have transitioned to oneshot mode while idle */
+ if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT)
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ int cpu = smp_processor_id();
+
+ if (!tick_oneshot_wakeup_control(state, td, cpu))
+ return 0;
+
+ if (tick_broadcast_device.evtdev)
+ return ___tick_broadcast_oneshot_control(state, td, cpu);
+
+ /*
+ * If there is no broadcast or wakeup device, tell the caller not
+ * to go into deep idle.
+ */
+ return -EBUSY;
+}
+
/*
* Reset the one shot broadcast for a cpu
*
@@ -877,50 +1002,120 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
}
}
+static inline ktime_t tick_get_next_period(void)
+{
+ ktime_t next;
+
+ /*
+ * Protect against concurrent updates (store /load tearing on
+ * 32bit). It does not matter if the time is already in the
+ * past. The broadcast device which is about to be programmed will
+ * fire in any case.
+ */
+ raw_spin_lock(&jiffies_lock);
+ next = tick_next_period;
+ raw_spin_unlock(&jiffies_lock);
+ return next;
+}
+
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
+ bool from_periodic)
{
int cpu = smp_processor_id();
+ ktime_t nexttick = 0;
if (!bc)
return;
- /* Set it up only once ! */
- if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = clockevent_state_periodic(bc);
-
- bc->event_handler = tick_handle_oneshot_broadcast;
-
+ /*
+ * When the broadcast device was switched to oneshot by the first
+ * CPU handling the NOHZ change, the other CPUs will reach this
+ * code via hrtimer_run_queues() -> tick_check_oneshot_change()
+ * too. Set up the broadcast device only once!
+ */
+ if (bc->event_handler == tick_handle_oneshot_broadcast) {
/*
- * We must be careful here. There might be other CPUs
- * waiting for periodic broadcast. We need to set the
- * oneshot_mask bits for those and program the
- * broadcast device to fire.
+ * The CPU which switched from periodic to oneshot mode
+ * set the broadcast oneshot bit for all other CPUs which
+ * are in the general (periodic) broadcast mask to ensure
+ * that CPUs which wait for the periodic broadcast are
+ * woken up.
+ *
+ * Clear the bit for the local CPU as the set bit would
+ * prevent the first tick_broadcast_enter() after this CPU
+ * switched to oneshot state to program the broadcast
+ * device.
+ *
+ * This code can also be reached via tick_broadcast_control(),
+ * but this cannot avoid the tick_broadcast_clear_oneshot()
+ * as that would break the periodic to oneshot transition of
+ * secondary CPUs. But that's harmless as the below only
+ * clears already cleared bits.
*/
+ tick_broadcast_clear_oneshot(cpu);
+ return;
+ }
+
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event = KTIME_MAX;
+
+ /*
+ * When the tick mode is switched from periodic to oneshot it must
+ * be ensured that CPUs which are waiting for periodic broadcast
+ * get their wake-up at the next tick. This is achieved by ORing
+ * tick_broadcast_mask into tick_broadcast_oneshot_mask.
+ *
+ * For other callers, e.g. broadcast device replacement,
+ * tick_broadcast_oneshot_mask must not be touched as this would
+ * set bits for CPUs which are already NOHZ, but not idle. Their
+ * next tick_broadcast_enter() would observe the bit set and fail
+ * to update the expiry time and the broadcast event device.
+ */
+ if (from_periodic) {
cpumask_copy(tmpmask, tick_broadcast_mask);
+ /* Remove the local CPU as it is obviously not idle */
cpumask_clear_cpu(cpu, tmpmask);
- cpumask_or(tick_broadcast_oneshot_mask,
- tick_broadcast_oneshot_mask, tmpmask);
-
- if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- tick_broadcast_init_next_event(tmpmask,
- tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period);
- } else
- bc->next_event = KTIME_MAX;
- } else {
+ cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);
+
/*
- * The first cpu which switches to oneshot mode sets
- * the bit for all other cpus which are in the general
- * (periodic) broadcast mask. So the bit is set and
- * would prevent the first broadcast enter after this
- * to program the bc device.
+ * Ensure that the oneshot broadcast handler will wake the
+ * CPUs which are still waiting for periodic broadcast.
*/
- tick_broadcast_clear_oneshot(cpu);
+ nexttick = tick_get_next_period();
+ tick_broadcast_init_next_event(tmpmask, nexttick);
+
+ /*
+ * If the underlying broadcast clock event device is
+ * already in oneshot state, then there is nothing to do.
+ * The device was already armed for the next tick
+ * in tick_handle_broadcast_periodic()
+ */
+ if (clockevent_state_oneshot(bc))
+ return;
}
+
+ /*
+ * When switching from periodic to oneshot mode arm the broadcast
+ * device for the next tick.
+ *
+ * If the broadcast device has been replaced in oneshot mode and
+ * the oneshot broadcast mask is not empty, then arm it to expire
+ * immediately in order to reevaluate the next expiring timer.
+ * @nexttick is 0 and therefore in the past which will cause the
+ * clockevent code to force an event.
+ *
+ * For both cases the programming can be avoided when the oneshot
+ * broadcast mask is empty.
+ *
+ * tick_broadcast_set_event() implicitly switches the broadcast
+ * device to oneshot state.
+ */
+ if (!cpumask_empty(tick_broadcast_oneshot_mask))
+ tick_broadcast_set_event(bc, cpu, nexttick);
}
/*
@@ -929,14 +1124,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
void tick_broadcast_switch_to_oneshot(void)
{
struct clock_event_device *bc;
+ enum tick_device_mode oldmode;
unsigned long flags;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ oldmode = tick_broadcast_device.mode;
tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
bc = tick_broadcast_device.evtdev;
if (bc)
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -962,6 +1159,9 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
*/
static void tick_broadcast_oneshot_offline(unsigned int cpu)
{
+ if (tick_get_oneshot_wakeup_device(cpu))
+ tick_set_oneshot_wakeup_device(NULL, cpu);
+
/*
* Clear the broadcast masks for the dead cpu, but do not stop
* the broadcast device!
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 6c9c342dd0e5..e9138cd7a0f5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -27,10 +27,11 @@
*/
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
/*
- * Tick next event: keeps track of the tick time
+ * Tick next event: keeps track of the tick time. It's updated by the
+ * CPU which handles the tick and protected by jiffies_lock. There is
+ * no requirement to write hold the jiffies seqcount for it.
*/
ktime_t tick_next_period;
-ktime_t tick_period;
/*
* tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
@@ -88,7 +89,7 @@ static void tick_periodic(int cpu)
write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
- tick_next_period = ktime_add(tick_next_period, tick_period);
+ tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC);
do_timer(1);
write_seqcount_end(&jiffies_seq);
@@ -127,7 +128,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
* Setup the next period for devices, which do not have
* periodic mode:
*/
- next = ktime_add(next, tick_period);
+ next = ktime_add_ns(next, TICK_NSEC);
if (!clockevents_program_event(dev, next, false))
return;
@@ -173,7 +174,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
for (;;) {
if (!clockevents_program_event(dev, next, false))
return;
- next = ktime_add(next, tick_period);
+ next = ktime_add_ns(next, TICK_NSEC);
}
}
}
@@ -218,9 +219,7 @@ static void tick_setup_device(struct tick_device *td,
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
tick_do_timer_cpu = cpu;
-
tick_next_period = ktime_get();
- tick_period = NSEC_PER_SEC / HZ;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
@@ -348,12 +347,7 @@ void tick_check_new_device(struct clock_event_device *newdev)
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
- /* cpu local device ? */
- if (!tick_check_percpu(curdev, newdev, cpu))
- goto out_bc;
-
- /* Preference decision */
- if (!tick_check_preferred(curdev, newdev))
+ if (!tick_check_replacement(curdev, newdev))
goto out_bc;
if (!try_module_get(newdev->owner))
@@ -378,7 +372,7 @@ out_bc:
/*
* Can the new device be used as a broadcast device ?
*/
- tick_install_broadcast_device(newdev);
+ tick_install_broadcast_device(newdev, cpu);
}
/**
@@ -407,17 +401,13 @@ EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled. Not locking required. If
+ * Called with interrupts disabled. No locking required. If
* tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
void tick_handover_do_timer(void)
{
- if (tick_do_timer_cpu == smp_processor_id()) {
- int cpu = cpumask_first(cpu_online_mask);
-
- tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
- TICK_DO_TIMER_NONE;
- }
+ if (tick_do_timer_cpu == smp_processor_id())
+ tick_do_timer_cpu = cpumask_first(cpu_online_mask);
}
/*
@@ -479,6 +469,13 @@ void tick_resume_local(void)
else
tick_resume_oneshot();
}
+
+ /*
+ * Ensure that hrtimers are up to date and the clockevents device
+ * is reprogrammed correctly when high resolution timers are
+ * enabled.
+ */
+ hrtimers_resume_local();
}
/**
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7b2496136729..481b7ab65e2c 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -15,7 +15,6 @@
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
-extern ktime_t tick_period;
extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
@@ -57,12 +56,11 @@ extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern void tick_install_broadcast_device(struct clock_event_device *dev, int cpu);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
@@ -72,8 +70,9 @@ extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadc
extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
extern struct tick_device *tick_get_broadcast_device(void);
extern struct cpumask *tick_get_broadcast_mask(void);
+extern const struct clock_event_device *tick_get_wakeup_device(int cpu);
# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
-static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { }
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
@@ -165,3 +164,37 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
void timer_clear_idle(void);
+
+#define CLOCK_SET_WALL \
+ (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \
+ BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT))
+
+#define CLOCK_SET_BOOT \
+ (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT))
+
+void clock_was_set(unsigned int bases);
+void clock_was_set_delayed(void);
+
+void hrtimers_resume_local(void);
+
+/* Since jiffies uses a simple TICK_NSEC multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the nominator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. TICK_NSEC grows as HZ
+ * shrinks, so values greater than 8 overflow 32bits when
+ * HZ=100.
+ */
+#if HZ < 34
+#define JIFFIES_SHIFT 6
+#elif HZ < 67
+#define JIFFIES_SHIFT 7
+#else
+#define JIFFIES_SHIFT 8
+#endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c
new file mode 100644
index 000000000000..af225b32f5b3
--- /dev/null
+++ b/kernel/time/tick-legacy.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Timer tick function for architectures that lack generic clockevents,
+ * consolidated here from m68k/ia64/parisc/arm.
+ */
+
+#include <linux/irq.h>
+#include <linux/profile.h>
+#include <linux/timekeeper_internal.h>
+
+#include "tick-internal.h"
+
+/**
+ * legacy_timer_tick() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * This is used by platforms that have not been converted to
+ * generic clockevents.
+ *
+ * If 'ticks' is zero, the CPU is not handling timekeeping, so
+ * only perform process accounting and profiling.
+ *
+ * Must be called with interrupts disabled.
+ */
+void legacy_timer_tick(unsigned long ticks)
+{
+ if (ticks) {
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
+ do_timer(ticks);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
+ update_wall_time();
+ }
+ update_process_times(user_mode(get_irq_regs()));
+ profile_tick(CPU_PROFILING);
+}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f9745d47425a..5e2c2c26b3cc 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,7 @@
#include "tick-internal.h"
/**
- * tick_program_event
+ * tick_program_event - program the CPU local timer device for the next event
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force)
}
/**
- * tick_resume_onshot - resume oneshot mode
+ * tick_resume_oneshot - resume oneshot mode
*/
void tick_resume_oneshot(void)
{
@@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
}
/**
- * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
*
* returns 1 when either nohz or highres are enabled. otherwise 0.
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e2dc9b8858c..01fb50c1b17e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -4,7 +4,7 @@
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
- * No idle tick implementation for low and high resolution timers
+ * NOHZ implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
@@ -20,6 +20,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
+#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
@@ -44,7 +45,9 @@ struct tick_sched *tick_get_tick_sched(int cpu)
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ * The time when the last jiffy update happened. Write access must hold
+ * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
+ * consistent view of jiffies and last_jiffies_update.
*/
static ktime_t last_jiffies_update;
@@ -53,50 +56,95 @@ static ktime_t last_jiffies_update;
*/
static void tick_do_update_jiffies64(ktime_t now)
{
- unsigned long ticks = 0;
- ktime_t delta;
+ unsigned long ticks = 1;
+ ktime_t delta, nextp;
/*
- * Do a quick check without holding jiffies_lock:
- * The READ_ONCE() pairs with two updates done later in this function.
+ * 64-bit can do a quick check without holding the jiffies lock and
+ * without looking at the sequence count. The smp_load_acquire()
+ * pairs with the update done later in this function.
+ *
+ * 32-bit cannot do that because the store of 'tick_next_period'
+ * consists of two 32-bit stores, and the first store could be
+ * moved by the CPU to a random point in the future.
*/
- delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
- if (delta < tick_period)
- return;
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (ktime_before(now, smp_load_acquire(&tick_next_period)))
+ return;
+ } else {
+ unsigned int seq;
+
+ /*
+ * Avoid contention on 'jiffies_lock' and protect the quick
+ * check with the sequence count.
+ */
+ do {
+ seq = read_seqcount_begin(&jiffies_seq);
+ nextp = tick_next_period;
+ } while (read_seqcount_retry(&jiffies_seq, seq));
- /* Reevaluate with jiffies_lock held */
+ if (ktime_before(now, nextp))
+ return;
+ }
+
+ /* Quick check failed, i.e. update is required. */
raw_spin_lock(&jiffies_lock);
+ /*
+ * Re-evaluate with the lock held. Another CPU might have done the
+ * update already.
+ */
+ if (ktime_before(now, tick_next_period)) {
+ raw_spin_unlock(&jiffies_lock);
+ return;
+ }
+
write_seqcount_begin(&jiffies_seq);
- delta = ktime_sub(now, last_jiffies_update);
- if (delta >= tick_period) {
+ delta = ktime_sub(now, tick_next_period);
+ if (unlikely(delta >= TICK_NSEC)) {
+ /* Slow path for long idle sleep times */
+ s64 incr = TICK_NSEC;
- delta = ktime_sub(delta, tick_period);
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add(last_jiffies_update, tick_period));
+ ticks += ktime_divns(delta, incr);
- /* Slow path for long timeouts */
- if (unlikely(delta >= tick_period)) {
- s64 incr = ktime_to_ns(tick_period);
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ } else {
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ TICK_NSEC);
+ }
- ticks = ktime_divns(delta, incr);
+ /* Advance jiffies to complete the 'jiffies_seq' protected job */
+ jiffies_64 += ticks;
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add_ns(last_jiffies_update,
- incr * ticks));
- }
- do_timer(++ticks);
+ /* Keep the tick_next_period variable up to date */
+ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
- /* Keep the tick_next_period variable up to date */
- tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ /*
+ * Pairs with smp_load_acquire() in the lockless quick
+ * check above, and ensures that the update to 'jiffies_64' is
+ * not reordered vs. the store to 'tick_next_period', neither
+ * by the compiler nor by the CPU.
+ */
+ smp_store_release(&tick_next_period, nextp);
} else {
- write_seqcount_end(&jiffies_seq);
- raw_spin_unlock(&jiffies_lock);
- return;
+ /*
+ * A plain store is good enough on 32-bit, as the quick check
+ * above is protected by the sequence count.
+ */
+ tick_next_period = nextp;
}
+
+ /*
+ * Release the sequence count. calc_global_load() below is not
+ * protected by it, but 'jiffies_lock' needs to be held to prevent
+ * concurrent invocations.
+ */
write_seqcount_end(&jiffies_seq);
+
+ calc_global_load();
+
raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -110,15 +158,31 @@ static ktime_t tick_init_jiffy_update(void)
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
- /* Did we start the jiffies update yet ? */
- if (last_jiffies_update == 0)
+
+ /* Have we started the jiffies update yet ? */
+ if (last_jiffies_update == 0) {
+ u32 rem;
+
+ /*
+ * Ensure that the tick is aligned to a multiple of
+ * TICK_NSEC.
+ */
+ div_u64_rem(tick_next_period, TICK_NSEC, &rem);
+ if (rem)
+ tick_next_period += TICK_NSEC - rem;
+
last_jiffies_update = tick_next_period;
+ }
period = last_jiffies_update;
+
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
+
return period;
}
+#define MAX_STALLED_JIFFIES 5
+
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -129,23 +193,38 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
* concurrency: This happens only when the CPU in charge went
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
- * jiffies_lock.
+ * 'jiffies_lock'.
*
* If nohz_full is enabled, this should not happen because the
- * tick_do_timer_cpu never relinquishes.
+ * 'tick_do_timer_cpu' CPU never relinquishes.
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
- WARN_ON(tick_nohz_full_running);
+ WARN_ON_ONCE(tick_nohz_full_running);
#endif
tick_do_timer_cpu = cpu;
}
#endif
- /* Check, if the jiffies need an update */
+ /* Check if jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+ /*
+ * If the jiffies update stalled for too long (timekeeper in stop_machine()
+ * or VMEXIT'ed for several msecs), force an update.
+ */
+ if (ts->last_tick_jiffies != jiffies) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ } else {
+ if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
+ tick_do_update_jiffies64(now);
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
+ }
+
if (ts->inidle)
ts->got_idle_tick = 1;
}
@@ -156,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
+ * time. This happens on completely idle SMP systems while
* waiting on the login prompt. We also increment the "start of
* idle" jiffy stamp so the idle accounting adjustment we do
- * when we go busy again does not account too much ticks.
+ * when we go busy again does not account too many ticks.
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog_sched();
@@ -180,6 +259,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
@@ -213,6 +293,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU_EXP) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+ return true;
+ }
+
return false;
}
@@ -243,10 +328,8 @@ static void nohz_full_kick_func(struct irq_work *work)
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
-static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
- .func = nohz_full_kick_func,
- .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
-};
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
+ IRQ_WORK_INIT_HARD(nohz_full_kick_func);
/*
* Kick this CPU if it's full dynticks in order to force it to
@@ -274,6 +357,46 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+ int cpu;
+
+ /*
+ * If the task is not running, run_posix_cpu_timers()
+ * has nothing to elapse, and an IPI can then be optimized out.
+ *
+ * activate_task() STORE p->tick_dep_mask
+ * STORE p->on_rq
+ * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or())
+ * LOCK rq->lock LOAD p->on_rq
+ * smp_mb__after_spin_lock()
+ * tick_nohz_task_switch()
+ * LOAD p->tick_dep_mask
+ */
+ if (!sched_task_on_rq(tsk))
+ return;
+
+ /*
+ * If the task concurrently migrates to another CPU,
+ * we guarantee it sees the new tick dependency upon
+ * schedule.
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() STORE p->tick_dep_mask
+ * tick_nohz_task_switch() smp_mb() (atomic_fetch_or())
+ * LOAD p->tick_dep_mask LOAD p->cpu
+ */
+ cpu = task_cpu(tsk);
+
+ preempt_disable();
+ if (cpu_online(cpu))
+ tick_nohz_full_kick_cpu(cpu);
+ preempt_enable();
+}
+
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
@@ -303,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
/*
* Set a global tick dependency. Used by perf events that rely on freq and
- * by unstable clock.
+ * unstable clocks.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
@@ -317,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
- * manage events throttling.
+ * manage event-throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -333,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
- /* Remote irq work not NMI-safe */
+ /* Remote IRQ work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
@@ -351,16 +474,13 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
+ * in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- /*
- * We could optimize this with just kicking the target running the task
- * if that noise matters for nohz full users.
- */
- tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+ if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+ tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
@@ -374,9 +494,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
* per process timers.
*/
-void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+void tick_nohz_dep_set_signal(struct task_struct *tsk,
+ enum tick_dep_bits bit)
{
- tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+ int prev;
+ struct signal_struct *sig = tsk->signal;
+
+ prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+ if (!prev) {
+ struct task_struct *t;
+
+ lockdep_assert_held(&tsk->sighand->siglock);
+ __for_each_thread(sig, t)
+ tick_nohz_kick_task(t);
+ }
}
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
@@ -391,13 +522,10 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
*/
void __tick_nohz_task_switch(void)
{
- unsigned long flags;
struct tick_sched *ts;
- local_irq_save(flags);
-
if (!tick_nohz_full_cpu(smp_processor_id()))
- goto out;
+ return;
ts = this_cpu_ptr(&tick_cpu_sched);
@@ -406,8 +534,6 @@ void __tick_nohz_task_switch(void)
atomic_read(&current->signal->tick_dep_mask))
tick_nohz_full_kick();
}
-out:
- local_irq_restore(flags);
}
/* Get the boot-time nohz CPU list from the kernel parameters. */
@@ -417,18 +543,22 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
-EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
- * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+ * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
* timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -439,12 +569,12 @@ void __init tick_nohz_init(void)
return;
/*
- * Full dynticks uses irq work to drive the tick rescheduling on safe
- * locking contexts. But then we need irq work to raise its own
- * interrupts to avoid circular dependency on the tick
+ * Full dynticks uses IRQ work to drive the tick rescheduling on safe
+ * locking contexts. But then we need IRQ work to raise its own
+ * interrupts to avoid circular dependency on the tick.
*/
if (!arch_irq_work_has_interrupt()) {
- pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
+ pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
@@ -462,7 +592,7 @@ void __init tick_nohz_init(void)
}
for_each_cpu(cpu, tick_nohz_full_mask)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"kernel/nohz:predown", NULL,
@@ -514,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
* value. We do this unconditionally on any CPU, as we don't know whether the
- * CPU, which has the update task assigned is in a long sleep.
+ * CPU, which has the update task assigned, is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
@@ -529,43 +659,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!ts->idle_active))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (ts->idle_active && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -573,7 +727,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note that this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -583,27 +740,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -614,7 +753,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -624,26 +766,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -653,7 +778,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
/* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start_expires(&ts->sched_timer,
@@ -663,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
}
/*
- * Reset to make sure next tick stop doesn't get fooled by past
+ * Reset to make sure the next tick stop doesn't get fooled by past
* cached clock deadline.
*/
ts->next_tick = 0;
@@ -676,7 +801,7 @@ static inline bool local_timer_softirq_pending(void)
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ u64 basemono, next_tick, delta, expires;
unsigned long basejiff;
unsigned int seq;
@@ -692,14 +817,14 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/*
* Keep the periodic tick, when RCU, architecture or irq_work
* requests it.
- * Aside of that check whether the local timer softirq is
- * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * Aside of that, check whether the local timer softirq is
+ * pending. If so, its a bad idea to call get_next_timer_interrupt(),
* because there is an already expired timer, so it will request
- * immeditate expiry, which rearms the hardware timer with a
- * minimal delta which brings us back to this place
+ * immediate expiry, which rearms the hardware timer with a
+ * minimal delta, which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
- if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ if (rcu_needs_cpu() || arch_needs_cpu() ||
irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
@@ -710,12 +835,14 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* disabled this also looks at the next expiring
* hrtimer.
*/
- next_tmr = get_next_timer_interrupt(basejiff, basemono);
- ts->next_timer = next_tmr;
- /* Take the next rcu event into account */
- next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
+ next_tick = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tick;
}
+ /* Make sure next_tick is never before basemono! */
+ if (WARN_ON_ONCE(basemono > next_tick))
+ next_tick = basemono;
+
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
@@ -739,7 +866,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/*
* If this CPU is the one which had the do_timer() duty last, we limit
- * the sleep time to the timekeeping max_deferment value.
+ * the sleep time to the timekeeping 'max_deferment' value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
@@ -764,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono = ts->timer_expires_base;
u64 expires = ts->timer_expires;
- ktime_t tick = expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
@@ -773,8 +899,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
- * don't drop this here the jiffies might be stale and
- * do_timer() never invoked. Keep track of the fact that it
+ * don't drop this here, the jiffies might be stale and
+ * do_timer() never gets invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
if (cpu == tick_do_timer_cpu) {
@@ -784,10 +910,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
ts->do_timer_last = 0;
}
- /* Skip reprogram of event if its not changed */
+ /* Skip reprogram of event if it's not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
- if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
WARN_ON_ONCE(1);
@@ -797,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
/*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
+ * tick_nohz_stop_tick() can be called several times before
+ * tick_nohz_restart_sched_tick() is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the first
+ * call we save the current tick time, so we can restart the
+ * scheduler tick in tick_nohz_restart_sched_tick().
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
@@ -812,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
- ts->next_tick = tick;
+ ts->next_tick = expires;
/*
* If the expiration time == KTIME_MAX, then we simply stop
@@ -821,15 +947,17 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
+ else
+ tick_program_event(KTIME_MAX, 1);
return;
}
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, tick,
+ hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- hrtimer_set_expires(&ts->sched_timer, tick);
- tick_program_event(tick, 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
}
@@ -861,31 +989,73 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
+ /* Cancel the scheduled timer and restore the tick: */
+ ts->tick_stopped = 0;
tick_nohz_restart(ts, now);
}
-static void tick_nohz_full_update_tick(struct tick_sched *ts)
+static void __tick_nohz_full_update_tick(struct tick_sched *ts,
+ ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu))
+ if (can_stop_full_tick(cpu, ts))
+ tick_nohz_stop_sched_tick(ts, cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, now);
+#endif
+}
+
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
+{
+ if (!tick_nohz_full_cpu(smp_processor_id()))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, cpu);
- else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get());
-#endif
+ __tick_nohz_full_update_tick(ts, ktime_get());
+}
+
+/*
+ * A pending softirq outside an IRQ (or softirq disabled section) context
+ * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
+ * reach this code due to the need_resched() early check in can_stop_idle_tick().
+ *
+ * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
+ * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
+ * triggering the code below, since wakep_softirqd() is ignored.
+ *
+ */
+static bool report_idle_softirq(void)
+{
+ static int ratelimit;
+ unsigned int pending = local_softirq_pending();
+
+ if (likely(!pending))
+ return false;
+
+ /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
+ if (!cpu_active(smp_processor_id())) {
+ pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
+ if (!pending)
+ return false;
+ }
+
+ if (ratelimit >= 10)
+ return false;
+
+ /* On RT, softirq handling may be waiting on some lock */
+ if (local_bh_blocked())
+ return false;
+
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+
+ return true;
}
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
@@ -894,8 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
* If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
* the CPU which runs the tick timer next. If we don't drop
- * this here the jiffies might be stale and do_timer() never
- * invoked.
+ * this here, the jiffies might be stale and do_timer() never
+ * gets invoked.
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
@@ -914,17 +1084,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (need_resched())
return false;
- if (unlikely(local_softirq_pending())) {
- static int ratelimit;
-
- if (ratelimit < 10 &&
- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- pr_warn("NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
+ if (unlikely(report_idle_softirq()))
return false;
- }
if (tick_nohz_full_enabled()) {
/*
@@ -933,13 +1094,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
*/
if (tick_do_timer_cpu == cpu)
return false;
- /*
- * Boot safety: make sure the timekeeping duty has been
- * assigned before entering dyntick-idle mode,
- * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
- */
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
- return false;
/* Should not happen for nohz-full */
if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
@@ -949,10 +1103,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -984,16 +1144,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
@@ -1028,12 +1178,23 @@ void tick_nohz_idle_enter(void)
}
/**
- * tick_nohz_irq_exit - update next tick event from interrupt exit
+ * tick_nohz_irq_exit - Notify the tick about IRQ exit
+ *
+ * A timer may have been added/modified/deleted either by the current IRQ,
+ * or by another place using this IRQ as a notification. This IRQ may have
+ * also updated the RCU callback list. These events may require a
+ * re-evaluation of the next tick. Depending on the context:
*
- * When an interrupt fires while we are idle and it doesn't cause
- * a reschedule, it may still add, modify or delete a timer, enqueue
- * an RCU callback, etc...
- * So we need to re-calculate and reprogram the next tick event.
+ * 1) If the CPU is idle and no resched is pending, just proceed with idle
+ * time accounting. The next tick will be re-evaluated on the next idle
+ * loop iteration.
+ *
+ * 2) If the CPU is nohz_full:
+ *
+ * 2.1) If there is any tick dependency, restart the tick if stopped.
+ *
+ * 2.2) If there is no tick dependency, (re-)evaluate the next tick and
+ * stop/update it accordingly.
*/
void tick_nohz_irq_exit(void)
{
@@ -1061,7 +1222,7 @@ bool tick_nohz_idle_got_tick(void)
/**
* tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
- * or the tick, whatever that expires first. Note that, if the tick has been
+ * or the tick, whichever expires first. Note that, if the tick has been
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
@@ -1075,7 +1236,11 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
*
- * Called from power state control code with interrupts disabled
+ * Called from power state control code with interrupts disabled.
+ *
+ * The return value of this function and/or the value returned by it through the
+ * @delta_next pointer can be negative which must be taken into account by its
+ * callers.
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
@@ -1101,7 +1266,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
return *delta_next;
/*
- * If the next highres timer to expire is earlier than next_event, the
+ * If the next highres timer to expire is earlier than 'next_event', the
* idle governor needs to know that.
*/
next_event = min_t(u64, next_event,
@@ -1135,17 +1300,19 @@ unsigned long tick_nohz_get_idle_calls(void)
return ts->idle_calls;
}
-static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+static void tick_nohz_account_idle_time(struct tick_sched *ts,
+ ktime_t now)
{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
+ ts->idle_exittime = now;
+
if (vtime_accounting_enabled_this_cpu())
return;
/*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
+ * We stopped the tick in idle. update_process_times() would miss the
+ * time we slept, as it does only a 1 tick accounting.
+ * Enforce that this is accounted to idle !
*/
ticks = jiffies - ts->idle_jiffies;
/*
@@ -1153,29 +1320,44 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
*/
if (ticks && ticks < LONG_MAX)
account_idle_ticks(ticks);
-#endif
}
-static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+void tick_nohz_idle_restart_tick(void)
{
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->tick_stopped) {
+ ktime_t now = ktime_get();
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_time(ts, now);
+ }
}
-void tick_nohz_idle_restart_tick(void)
+static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ __tick_nohz_full_update_tick(ts, now);
+ else
+ tick_nohz_restart_sched_tick(ts, now);
- if (ts->tick_stopped)
- __tick_nohz_idle_restart_tick(ts, ktime_get());
+ tick_nohz_account_idle_time(ts, now);
}
/**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - Update the tick upon idle task exit
+ *
+ * When the idle task exits, update the tick depending on the
+ * following situations:
+ *
+ * 1) If the CPU is not in nohz_full mode (most cases), then
+ * restart the tick.
+ *
+ * 2) If the CPU is in nohz_full mode (corner case):
+ * 2.1) If the tick can be kept stopped (no tick dependencies)
+ * then re-evaluate the next tick and try to keep it stopped
+ * as long as possible.
+ * 2.2) If the tick has dependencies, restart the tick.
*
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
*/
void tick_nohz_idle_exit(void)
{
@@ -1199,15 +1381,21 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (tick_stopped)
- __tick_nohz_idle_restart_tick(ts, now);
+ tick_nohz_idle_update_tick(ts, now);
local_irq_enable();
}
/*
- * The nohz low res interrupt handler
+ * In low-resolution mode, the tick handler must be implemented directly
+ * at the clockevent level. hrtimer can't be used instead, because its
+ * infrastructure actually relies on the tick itself as a backend in
+ * low-resolution mode (see hrtimer_run_queues()).
+ *
+ * This low-resolution handler still makes use of some hrtimer APIs meanwhile
+ * for convenience with expiration calculation and forwarding.
*/
-static void tick_nohz_handler(struct clock_event_device *dev)
+static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
@@ -1218,12 +1406,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
- /* No need to reprogram if we are running tickless */
- if (unlikely(ts->tick_stopped))
- return;
+ /*
+ * In dynticks mode, tick reprogram is deferred:
+ * - to the idle task if in dynticks-idle
+ * - to IRQ exit if in full-dynticks.
+ */
+ if (likely(!ts->tick_stopped)) {
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ }
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
@@ -1237,7 +1429,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
}
/**
- * tick_nohz_switch_to_nohz - switch to nohz mode
+ * tick_nohz_switch_to_nohz - switch to NOHZ mode
*/
static void tick_nohz_switch_to_nohz(void)
{
@@ -1247,19 +1439,19 @@ static void tick_nohz_switch_to_nohz(void)
if (!tick_nohz_enabled)
return;
- if (tick_switch_to_oneshot(tick_nohz_handler))
+ if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
return;
/*
- * Recycle the hrtimer in ts, so we can share the
- * hrtimer_forward with the highres code.
+ * Recycle the hrtimer in 'ts', so we can share the
+ * hrtimer_forward_now() function with the highres code.
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
/* Get the next period */
next = tick_init_jiffy_update();
hrtimer_set_expires(&ts->sched_timer, next);
- hrtimer_forward_now(&ts->sched_timer, tick_period);
+ hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
@@ -1274,6 +1466,13 @@ static inline void tick_nohz_irq_enter(void)
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
+ /*
+ * If all CPUs are idle we may need to update a stale jiffies value.
+ * Note nohz_full is a special case: a timekeeper is guaranteed to stay
+ * alive but it might be busy looping with interrupts disabled in some
+ * rare case (typically stop machine). So we must make sure we have a
+ * last resort.
+ */
if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
}
@@ -1287,7 +1486,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * Called from irq_enter to notify about the possible interruption of idle()
+ * Called from irq_enter() to notify about the possible interruption of idle()
*/
void tick_irq_enter(void)
{
@@ -1303,7 +1502,7 @@ void tick_irq_enter(void)
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
*/
-static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
@@ -1313,19 +1512,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
tick_sched_do_timer(ts, now);
/*
- * Do not call, when we are not in irq context and have
- * no valid regs pointer
+ * Do not call when we are not in IRQ context and have
+ * no valid 'regs' pointer
*/
if (regs)
tick_sched_handle(ts, regs);
else
ts->next_tick = 0;
- /* No need to reprogram if we are in idle or full dynticks mode */
+ /*
+ * In dynticks mode, tick reprogram is deferred:
+ * - to the idle task if in dynticks-idle
+ * - to IRQ exit if in full-dynticks.
+ */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
- hrtimer_forward(timer, now, tick_period);
+ hrtimer_forward(timer, now, TICK_NSEC);
return HRTIMER_RESTART;
}
@@ -1348,24 +1551,22 @@ void tick_setup_sched_timer(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
- /*
- * Emulate tick processing via per-CPU hrtimers:
- */
+ /* Emulate tick processing via per-CPU hrtimers: */
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- ts->sched_timer.function = tick_sched_timer;
+ ts->sched_timer.function = tick_nohz_highres_handler;
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
- /* Offset the tick to avert jiffies_lock contention. */
+ /* Offset the tick to avert 'jiffies_lock' contention. */
if (sched_skew_tick) {
- u64 offset = ktime_to_ns(tick_period) >> 1;
+ u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
@@ -1375,17 +1576,27 @@ void tick_setup_sched_timer(void)
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t idle_sleeptime, iowait_sleeptime;
+ unsigned long idle_calls, idle_sleeps;
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
+ idle_sleeptime = ts->idle_sleeptime;
+ iowait_sleeptime = ts->iowait_sleeptime;
+ idle_calls = ts->idle_calls;
+ idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
+ ts->idle_sleeptime = idle_sleeptime;
+ ts->iowait_sleeptime = iowait_sleeptime;
+ ts->idle_calls = idle_calls;
+ ts->idle_sleeps = idle_sleeps;
}
#endif
-/**
+/*
* Async notification about clocksource changes
*/
void tick_clock_notify(void)
@@ -1406,11 +1617,11 @@ void tick_oneshot_notify(void)
set_bit(0, &ts->check_clocks);
}
-/**
- * Check, if a change happened, which makes oneshot possible.
+/*
+ * Check if a change happened, which makes oneshot possible.
*
- * Called cyclic from the hrtimer softirq (driven by the timer
- * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
* mode, because high resolution timers are disabled (either compile
* or runtime). Called with interrupts disabled.
*/
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 4fb06527cf64..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,61 +22,82 @@ enum tick_nohz_mode {
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
- * it is resetted during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * it is reset during irq handling phases.
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3985b2b32d08..642647f5046b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
-/*
- * Convert jiffies to milliseconds and back.
+/**
+ * jiffies_to_msecs - Convert jiffies to milliseconds
+ * @j: jiffies value
*
* Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
+ * two most common HZ cases.
+ *
+ * Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
@@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
+/**
+ * jiffies_to_usecs - Convert jiffies to microseconds
+ * @j: jiffies value
+ *
+ * Return: microseconds value
+ */
unsigned int jiffies_to_usecs(const unsigned long j)
{
/*
@@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_usecs);
-/*
+/**
* mktime64 - Converts date to seconds.
+ * @year0: year to convert
+ * @mon0: month to convert
+ * @day: day to convert
+ * @hour: hour to convert
+ * @min: minute to convert
+ * @sec: second to convert
+ *
* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
@@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs);
*
* An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
* tomorrow - (allowable under ISO 8601) is supported.
+ *
+ * Return: seconds since the epoch time for the given input date
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
@@ -449,7 +467,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
-struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
+struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
{
struct timespec64 ts = ns_to_timespec64(nsec);
struct __kernel_old_timeval tv;
@@ -462,7 +480,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
EXPORT_SYMBOL(ns_to_kernel_old_timeval);
/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
*
* @ts: pointer to timespec variable to be set
* @sec: seconds to set
@@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval);
* Set seconds and nanoseconds field of a timespec variable and
* normalize to the timespec storage format
*
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
+ * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
@@ -501,9 +518,9 @@ EXPORT_SYMBOL(set_normalized_timespec64);
* ns_to_timespec64 - Convert nanoseconds to timespec64
* @nsec: the nanoseconds value to be converted
*
- * Returns the timespec64 representation of the nsec parameter.
+ * Return: the timespec64 representation of the nsec parameter.
*/
-struct timespec64 ns_to_timespec64(const s64 nsec)
+struct timespec64 ns_to_timespec64(s64 nsec)
{
struct timespec64 ts = { 0, 0 };
s32 rem;
@@ -526,7 +543,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
EXPORT_SYMBOL(ns_to_timespec64);
/**
- * msecs_to_jiffies: - convert milliseconds to jiffies
+ * __msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
*
* conversion is done as follows:
@@ -541,13 +558,15 @@ EXPORT_SYMBOL(ns_to_timespec64);
* handling any 32-bit overflows.
* for the details see __msecs_to_jiffies()
*
- * msecs_to_jiffies() checks for the passed in value being a constant
+ * __msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
- * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
+ *
+ * Return: jiffies value
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
@@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
}
EXPORT_SYMBOL(__msecs_to_jiffies);
+/**
+ * __usecs_to_jiffies: - convert microseconds to jiffies
+ * @u: time in milliseconds
+ *
+ * Return: jiffies value
+ */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
@@ -568,10 +593,13 @@ unsigned long __usecs_to_jiffies(const unsigned int u)
}
EXPORT_SYMBOL(__usecs_to_jiffies);
-/*
+/**
+ * timespec64_to_jiffies - convert a timespec64 value to jiffies
+ * @value: pointer to &struct timespec64
+ *
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
- * resolution values don't fall on second boundries. I.e. the line:
+ * resolution values don't fall on second boundaries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
* Note that due to the small error in the multiplier here, this
* rounding is incorrect for sufficiently large values of tv_nsec, but
@@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
+ *
+ * Return: jiffies value
*/
-
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
@@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value)
}
EXPORT_SYMBOL(timespec64_to_jiffies);
+/**
+ * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
+ * @jiffies: jiffies value
+ * @value: pointer to &struct timespec64
+ */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
@@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
+
+/**
+ * jiffies_to_clock_t - Convert jiffies to clock_t
+ * @x: jiffies value
+ *
+ * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
+ */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x)
}
EXPORT_SYMBOL(jiffies_to_clock_t);
+/**
+ * clock_t_to_jiffies - Convert clock_t to jiffies
+ * @x: clock_t value
+ *
+ * Return: clock_t value converted to jiffies
+ */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
@@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
}
EXPORT_SYMBOL(clock_t_to_jiffies);
+/**
+ * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
+ * @x: jiffies_64 value
+ *
+ * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x)
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);
+/**
+ * nsec_to_clock_t - Convert nsec value to clock_t
+ * @x: nsec value
+ *
+ * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
@@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+/**
+ * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
+ * @j: jiffies64 value
+ *
+ * Return: nanoseconds value
+ */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
@@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j)
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
+/**
+ * jiffies64_to_msecs - Convert jiffies64 to milliseconds
+ * @j: jiffies64 value
+ *
+ * Return: milliseconds value
+ */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
@@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies64 value
*/
u64 nsecs_to_jiffies64(u64 n)
{
@@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies value
*/
unsigned long nsecs_to_jiffies(u64 n)
{
@@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n)
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
-/*
- * Add two timespec64 values and do a safety check for overflow.
+/**
+ * timespec64_add_safe - Add two timespec64 values and do a safety check
+ * for overflow.
+ * @lhs: first (left) timespec64 to add
+ * @rhs: second (right) timespec64 to add
+ *
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
+ *
+ * Return: sum of @lhs + @rhs
*/
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs)
@@ -778,6 +859,15 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+/**
+ * get_timespec64 - get user's time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's time value as &struct __kernel_timespec
+ *
+ * Handles compat or 32-bit modes.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_timespec64(struct timespec64 *ts,
const struct __kernel_timespec __user *uts)
{
@@ -801,6 +891,14 @@ int get_timespec64(struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(get_timespec64);
+/**
+ * put_timespec64 - convert timespec64 value to __kernel_timespec format and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct __kernel_timespec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_timespec64(const struct timespec64 *ts,
struct __kernel_timespec __user *uts)
{
@@ -839,6 +937,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64,
return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
}
+/**
+ * get_old_timespec32 - get user's old-format time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's old-format time value (&struct old_timespec32)
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -848,6 +955,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
}
EXPORT_SYMBOL_GPL(get_old_timespec32);
+/**
+ * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct old_timespec32
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -857,6 +974,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(put_old_timespec32);
+/**
+ * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
+ * @it: destination &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_itimerspec64(struct itimerspec64 *it,
const struct __kernel_itimerspec __user *uit)
{
@@ -872,6 +996,14 @@ int get_itimerspec64(struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(get_itimerspec64);
+/**
+ * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
+ * and copy the latter to userspace
+ * @it: input &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_itimerspec64(const struct itimerspec64 *it,
struct __kernel_itimerspec __user *uit)
{
@@ -887,6 +1019,13 @@ int put_itimerspec64(const struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(put_itimerspec64);
+/**
+ * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
+ * @its: destination &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_itimerspec32(struct itimerspec64 *its,
const struct old_itimerspec32 __user *uits)
{
@@ -898,6 +1037,14 @@ int get_old_itimerspec32(struct itimerspec64 *its,
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);
+/**
+ * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
+ * old_itimerspec32 and copy the latter to userspace
+ * @its: input &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_itimerspec32(const struct itimerspec64 *its,
struct old_itimerspec32 __user *uits)
{
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
new file mode 100644
index 000000000000..ca058c8af6ba
--- /dev/null
+++ b/kernel/time/time_test.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <kunit/test.h>
+#include <linux/time.h>
+
+/*
+ * Traditional implementation of leap year evaluation.
+ */
+static bool is_leap(long year)
+{
+ return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
+}
+
+/*
+ * Gets the last day of a month.
+ */
+static int last_day_of_month(long year, int month)
+{
+ if (month == 2)
+ return 28 + is_leap(year);
+ if (month == 4 || month == 6 || month == 9 || month == 11)
+ return 30;
+ return 31;
+}
+
+/*
+ * Advances a date by one day.
+ */
+static void advance_date(long *year, int *month, int *mday, int *yday)
+{
+ if (*mday != last_day_of_month(*year, *month)) {
+ ++*mday;
+ ++*yday;
+ return;
+ }
+
+ *mday = 1;
+ if (*month != 12) {
+ ++*month;
+ ++*yday;
+ return;
+ }
+
+ *month = 1;
+ *yday = 0;
+ ++*year;
+}
+
+/*
+ * Checks every day in a 160000 years interval centered at 1970-01-01
+ * against the expected result.
+ */
+static void time64_to_tm_test_date_range(struct kunit *test)
+{
+ /*
+ * 80000 years = (80000 / 400) * 400 years
+ * = (80000 / 400) * 146097 days
+ * = (80000 / 400) * 146097 * 86400 seconds
+ */
+ time64_t total_secs = ((time64_t) 80000) / 400 * 146097 * 86400;
+ long year = 1970 - 80000;
+ int month = 1;
+ int mdday = 1;
+ int yday = 0;
+
+ struct tm result;
+ time64_t secs;
+ s64 days;
+
+ for (secs = -total_secs; secs <= total_secs; secs += 86400) {
+
+ time64_to_tm(secs, 0, &result);
+
+ days = div_s64(secs, 86400);
+
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ year, month, mdday, yday, days
+
+ KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, month - 1, result.tm_mon, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, mdday, result.tm_mday, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, yday, result.tm_yday, FAIL_MSG);
+
+ advance_date(&year, &month, &mdday, &yday);
+ }
+}
+
+static struct kunit_case time_test_cases[] = {
+ KUNIT_CASE_SLOW(time64_to_tm_test_date_range),
+ {}
+};
+
+static struct kunit_suite time_test_suite = {
+ .name = "time_test_cases",
+ .test_cases = time_test_cases,
+};
+
+kunit_test_suite(time_test_suite);
+MODULE_LICENSE("GPL");
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 589e0a552129..59b922c826e7 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -22,47 +22,16 @@
/*
* Converts the calendar time to broken-down time representation
- * Based on code from glibc-2.6
*
* 2009-7-14:
* Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ * 2021-06-02:
+ * Reimplemented by Cassio Neri <cassio.neri@gmail.com>
*/
#include <linux/time.h>
#include <linux/module.h>
-
-/*
- * Nonzero if YEAR is a leap year (every 4 years,
- * except every 100th isn't, and every 400th is).
- */
-static int __isleap(long year)
-{
- return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
-}
-
-/* do a mathdiv for long type */
-static long math_div(long a, long b)
-{
- return a / b - (a % b < 0);
-}
-
-/* How many leap years between y1 and y2, y1 must less or equal to y2 */
-static long leaps_between(long y1, long y2)
-{
- long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
- + math_div(y1 - 1, 400);
- long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
- + math_div(y2 - 1, 400);
- return leaps2 - leaps1;
-}
-
-/* How many days come before each month (0-12). */
-static const unsigned short __mon_yday[2][13] = {
- /* Normal years. */
- {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
- /* Leap years. */
- {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
-};
+#include <linux/kernel.h>
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
@@ -70,16 +39,18 @@ static const unsigned short __mon_yday[2][13] = {
/**
* time64_to_tm - converts the calendar time to local broken-down time
*
- * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970,
* Coordinated Universal Time (UTC).
- * @offset offset seconds adding to totalsecs.
- * @result pointer to struct tm variable to receive broken-down time
+ * @offset: offset seconds adding to totalsecs.
+ * @result: pointer to struct tm variable to receive broken-down time
*/
void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
- long days, rem, y;
+ u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day;
+ u64 u64tmp, udays, century, year;
+ bool is_Jan_or_Feb, is_leap_year;
+ long days, rem;
int remainder;
- const unsigned short *ip;
days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
rem = remainder;
@@ -103,27 +74,68 @@ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
if (result->tm_wday < 0)
result->tm_wday += 7;
- y = 1970;
-
- while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
- /* Guess a corrected year, assuming 365 days per year. */
- long yg = y + math_div(days, 365);
-
- /* Adjust DAYS and Y to match the guessed year. */
- days -= (yg - y) * 365 + leaps_between(y, yg);
- y = yg;
- }
-
- result->tm_year = y - 1900;
-
- result->tm_yday = days;
-
- ip = __mon_yday[__isleap(y)];
- for (y = 11; days < ip[y]; y--)
- continue;
- days -= ip[y];
-
- result->tm_mon = y;
- result->tm_mday = days + 1;
+ /*
+ * The following algorithm is, basically, Proposition 6.3 of Neri
+ * and Schneider [1]. In a few words: it works on the computational
+ * (fictitious) calendar where the year starts in March, month = 2
+ * (*), and finishes in February, month = 13. This calendar is
+ * mathematically convenient because the day of the year does not
+ * depend on whether the year is leap or not. For instance:
+ *
+ * March 1st 0-th day of the year;
+ * ...
+ * April 1st 31-st day of the year;
+ * ...
+ * January 1st 306-th day of the year; (Important!)
+ * ...
+ * February 28th 364-th day of the year;
+ * February 29th 365-th day of the year (if it exists).
+ *
+ * After having worked out the date in the computational calendar
+ * (using just arithmetics) it's easy to convert it to the
+ * corresponding date in the Gregorian calendar.
+ *
+ * [1] "Euclidean Affine Functions and Applications to Calendar
+ * Algorithms". https://arxiv.org/abs/2102.06959
+ *
+ * (*) The numbering of months follows tm more closely and thus,
+ * is slightly different from [1].
+ */
+
+ udays = ((u64) days) + 2305843009213814918ULL;
+
+ u64tmp = 4 * udays + 3;
+ century = div64_u64_rem(u64tmp, 146097, &u64tmp);
+ day_of_century = (u32) (u64tmp / 4);
+
+ u32tmp = 4 * day_of_century + 3;
+ u64tmp = 2939745ULL * u32tmp;
+ year_of_century = upper_32_bits(u64tmp);
+ day_of_year = lower_32_bits(u64tmp) / 2939745 / 4;
+
+ year = 100 * century + year_of_century;
+ is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4);
+
+ u32tmp = 2141 * day_of_year + 132377;
+ month = u32tmp >> 16;
+ day = ((u16) u32tmp) / 2141;
+
+ /*
+ * Recall that January 1st is the 306-th day of the year in the
+ * computational (not Gregorian) calendar.
+ */
+ is_Jan_or_Feb = day_of_year >= 306;
+
+ /* Convert to the Gregorian calendar and adjust to Unix time. */
+ year = year + is_Jan_or_Feb - 6313183731940000ULL;
+ month = is_Jan_or_Feb ? month - 12 : month;
+ day = day + 1;
+ day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year;
+
+ /* Convert to tm's format. */
+ result->tm_year = (long) (year - 1900);
+ result->tm_mon = (int) month;
+ result->tm_mday = (int) day;
+ result->tm_yday = (int) day_of_year;
}
EXPORT_SYMBOL(time64_to_tm);
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 85b98e727306..e6285288d765 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -76,7 +76,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
return ns;
}
-u64 timecounter_cyc2time(struct timecounter *tc,
+u64 timecounter_cyc2time(const struct timecounter *tc,
u64 cycle_tstamp)
{
u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d20d489841c8..266d02809dbb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -17,11 +17,13 @@
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
+#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
+#include <linux/random.h>
#include "tick-internal.h"
#include "ntp_internal.h"
@@ -39,20 +41,24 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
+DEFINE_RAW_SPINLOCK(timekeeper_lock);
+
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_ZERO(tk_core.seq),
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
/**
* struct tk_fast - NMI safe timekeeper
* @seq: Sequence counter for protecting updates. The lowest bit
@@ -63,7 +69,7 @@ static struct timekeeper shadow_timekeeper;
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
- seqcount_t seq;
+ seqcount_latch_t seq;
struct tk_read_base base[2];
};
@@ -72,26 +78,42 @@ static u64 cycles_at_suspend;
static u64 dummy_clock_read(struct clocksource *cs)
{
- return cycles_at_suspend;
+ if (timekeeping_suspended)
+ return cycles_at_suspend;
+ return local_clock();
}
static struct clocksource dummy_clock = {
.read = dummy_clock_read,
};
+/*
+ * Boot time initialization which allows local_clock() to be utilized
+ * during early boot when clocksources are not available. local_clock()
+ * returns nanoseconds already so no conversion is required, hence mult=1
+ * and shift=0. When the first proper clocksource is installed then
+ * the fast time keepers are updated with the correct values.
+ */
+#define FAST_TK_INIT \
+ { \
+ .clock = &dummy_clock, \
+ .mask = CLOCKSOURCE_MASK(64), \
+ .mult = 1, \
+ .shift = 0, \
+ }
+
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
- .base[0] = { .clock = &dummy_clock, },
- .base[1] = { .clock = &dummy_clock, },
+ .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
+ .base[0] = FAST_TK_INIT,
+ .base[1] = FAST_TK_INIT,
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
- .base[0] = { .clock = &dummy_clock, },
- .base[1] = { .clock = &dummy_clock, },
+ .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
+ .base[0] = FAST_TK_INIT,
+ .base[1] = FAST_TK_INIT,
};
-/* flag for if timekeeping is suspended */
-int __read_mostly timekeeping_suspended;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -157,7 +179,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* tk_clock_read - atomic clocksource read() helper
*
* This helper is necessary to use in the read paths because, while the
- * seqlock ensures we don't return a bad value while structures are updated,
+ * seqcount ensures we don't return a bad value while structures are updated,
* it doesn't protect from potential crashes. There is the possibility that
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
@@ -222,10 +244,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
unsigned int seq;
/*
- * Since we're called holding a seqlock, the data may shift
+ * Since we're called holding a seqcount, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
- * results away. So nest another seqlock here to atomically
+ * results away. So nest another seqcount here to atomically
* grab the points we are checking with.
*/
do {
@@ -349,13 +371,6 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Timekeeper helper functions. */
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-static u32 default_arch_gettimeoffset(void) { return 0; }
-u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
-#else
-static inline u32 arch_gettimeoffset(void) { return 0; }
-#endif
-
static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
{
u64 nsec;
@@ -363,8 +378,7 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
+ return nsec;
}
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
@@ -387,6 +401,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
+ * @tkf: Pointer to NMI safe timekeeper
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
@@ -416,6 +431,30 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
}
+static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
+{
+ u64 delta, cycles = tk_clock_read(tkr);
+
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base);
+ now += fast_tk_get_delta_ns(tkr);
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+
+ return now;
+}
+
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
@@ -442,40 +481,25 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
*
* So reader 6 will observe time going backwards versus reader 5.
*
- * While other CPUs are likely to be able observe that, the only way
+ * While other CPUs are likely to be able to observe that, the only way
* for a CPU local observation is when an NMI hits in the middle of
* the update. Timestamps taken from that NMI context might be ahead
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
-static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
-{
- struct tk_read_base *tkr;
- unsigned int seq;
- u64 now;
-
- do {
- seq = raw_read_seqcount_latch(&tkf->seq);
- tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base);
-
- now += timekeeping_delta_to_ns(tkr,
- clocksource_delta(
- tk_clock_read(tkr),
- tkr->cycle_last,
- tkr->mask));
- } while (read_seqcount_retry(&tkf->seq, seq));
-
- return now;
-}
-
-u64 ktime_get_mono_fast_ns(void)
+u64 notrace ktime_get_mono_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
-u64 ktime_get_raw_fast_ns(void)
+/**
+ * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
+ *
+ * Contrary to ktime_get_mono_fast_ns() this is always correct because the
+ * conversion factor is not affected by NTP/PTP correction.
+ */
+u64 notrace ktime_get_raw_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_raw);
}
@@ -486,7 +510,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
*
* To keep it NMI safe since we're accessing from tracing, we're not using a
* separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
+ * protected with seqcounts. This has the following minor side effects:
*
* (1) Its possible that a timestamp be taken after the boot offset is updated
* but before the timekeeper is updated. If this happens, the new boot offset
@@ -501,50 +525,120 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
* (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
* partially updated. Since the tk->offs_boot update is a rare event, this
* should be a rare occurrence which postprocessing should be able to handle.
+ *
+ * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
+ * apply as well.
*/
u64 notrace ktime_get_boot_fast_ns(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-
-/*
- * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+/**
+ * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
+ *
+ * The same limitations as described for ktime_get_boot_fast_ns() apply. The
+ * mono time and the TAI offset are not read atomically which may yield wrong
+ * readouts. However, an update of the TAI offset is an rare event e.g., caused
+ * by settime or adjtimex with an offset. The user of this function has to deal
+ * with the possibility of wrong timestamps in post processing.
*/
-static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
+u64 notrace ktime_get_tai_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
+}
+EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
+
+static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
struct tk_read_base *tkr;
+ u64 basem, baser, delta;
unsigned int seq;
- u64 now;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base_real);
+ basem = ktime_to_ns(tkr->base);
+ baser = ktime_to_ns(tkr->base_real);
+ delta = fast_tk_get_delta_ns(tkr);
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
- now += timekeeping_delta_to_ns(tkr,
- clocksource_delta(
- tk_clock_read(tkr),
- tkr->cycle_last,
- tkr->mask));
- } while (read_seqcount_retry(&tkf->seq, seq));
-
- return now;
+ if (mono)
+ *mono = basem + delta;
+ return baser + delta;
}
/**
* ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ *
+ * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
*/
u64 ktime_get_real_fast_ns(void)
{
- return __ktime_get_real_fast_ns(&tk_fast_mono);
+ return __ktime_get_real_fast(&tk_fast_mono, NULL);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
+ * ktime_get_fast_timestamps: - NMI safe timestamps
+ * @snapshot: Pointer to timestamp storage
+ *
+ * Stores clock monotonic, boottime and realtime timestamps.
+ *
+ * Boot time is a racy access on 32bit systems if the sleep time injection
+ * happens late during resume and not in timekeeping_resume(). That could
+ * be avoided by expanding struct tk_read_base with boot offset for 32bit
+ * and adding more overhead to the update. As this is a hard to observe
+ * once per resume event which can be filtered with reasonable effort using
+ * the accurate mono/real timestamps, it's probably not worth the trouble.
+ *
+ * Aside of that it might be possible on 32 and 64 bit to observe the
+ * following when the sleep time injection happens late:
+ *
+ * CPU 0 CPU 1
+ * timekeeping_resume()
+ * ktime_get_fast_timestamps()
+ * mono, real = __ktime_get_real_fast()
+ * inject_sleep_time()
+ * update boot offset
+ * boot = mono + bootoffset;
+ *
+ * That means that boot time already has the sleep time adjustment, but
+ * real time does not. On the next readout both are in sync again.
+ *
+ * Preventing this for 64bit is not really feasible without destroying the
+ * careful cache layout of the timekeeper because the sequence count and
+ * struct tk_read_base would then need two cache lines instead of one.
+ *
+ * Access to the time keeper clock source is disabled across the innermost
+ * steps of suspend/resume. The accessors still work, but the timestamps
+ * are frozen until time keeping is resumed which happens very early.
+ *
+ * For regular suspend/resume there is no observable difference vs. sched
+ * clock, but it might affect some of the nasty low level debug printks.
+ *
+ * OTOH, access to sched clock is not guaranteed across suspend/resume on
+ * all systems either so it depends on the hardware in use.
+ *
+ * If that turns out to be a real problem then this could be mitigated by
+ * using sched clock in a similar way as during early boot. But it's not as
+ * trivial as on early boot because it needs some careful protection
+ * against the clock monotonic timestamp jumping backwards on resume.
+ */
+void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
+ snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
+}
+
+/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
*
@@ -580,6 +674,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
+ * @nb: Pointer to the notifier block to register
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
@@ -599,6 +694,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
+ * @nb: Pointer to the notifier block to unregister
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
@@ -689,6 +785,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
/**
* timekeeping_forward_now - update clock to the current time
+ * @tk: Pointer to the timekeeper to update
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
@@ -704,16 +801,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->tkr_raw.cycle_last = cycle_now;
tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
-
- /* If arch requires, add in get_arch_timeoffset() */
- tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
-
-
tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
- /* If arch requires, add in get_arch_timeoffset() */
- tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
-
tk_normalize_xtime(tk);
}
@@ -829,7 +918,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
/**
- * ktime_mono_to_any() - convert mononotic time to any other time
+ * ktime_mono_to_any() - convert monotonic time to any other time
* @tmono: time to convert.
* @offs: which offset to use
*/
@@ -921,8 +1010,7 @@ EXPORT_SYMBOL_GPL(ktime_get_seconds);
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
- * Returns the wall clock seconds since 1970. This replaces the
- * get_seconds() interface which is not y2038 safe on 32bit systems.
+ * Returns the wall clock seconds since 1970.
*
* For 64bit systems the fast access to tk->xtime_sec is preserved. On
* 32bit systems the access must be protected with the sequence
@@ -979,6 +1067,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_clock_read(&tk->tkr_mono);
+ systime_snapshot->cs_id = tk->tkr_mono.clock->id;
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -1236,8 +1325,7 @@ int do_settimeofday64(const struct timespec64 *ts)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
ret = -EINVAL;
@@ -1253,11 +1341,13 @@ out:
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL);
- if (!ret)
+ if (!ret) {
audit_tk_injoffset(ts_delta);
+ add_device_randomness(ts, sizeof(*ts));
+ }
return ret;
}
@@ -1265,7 +1355,7 @@ EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
- * @tv: pointer to the timespec variable containing the offset
+ * @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
@@ -1301,8 +1391,8 @@ error: /* even if we error out, we forwarded the time, so call update */
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL);
return ret;
}
@@ -1341,9 +1431,8 @@ void timekeeping_warp_clock(void)
}
}
-/**
+/*
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
- *
*/
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
@@ -1351,7 +1440,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
}
-/**
+/*
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
@@ -1359,35 +1448,45 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
static int change_clocksource(void *data)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *new, *old;
+ struct clocksource *new, *old = NULL;
unsigned long flags;
+ bool change = false;
new = (struct clocksource *) data;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
/*
* If the cs is in module, get a module reference. Succeeds
* for built-in code (owner == NULL) as well.
*/
if (try_module_get(new->owner)) {
- if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr_mono.clock;
- tk_setup_internals(tk, new);
- if (old->disable)
- old->disable(old);
- module_put(old->owner);
- } else {
+ if (!new->enable || new->enable(new) == 0)
+ change = true;
+ else
module_put(new->owner);
- }
}
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ if (change) {
+ old = tk->tkr_mono.clock;
+ tk_setup_internals(tk, new);
+ }
+
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (old) {
+ if (old->disable)
+ old->disable(old);
+
+ module_put(old->owner);
+ }
+
return 0;
}
@@ -1474,6 +1573,7 @@ u64 timekeeping_max_deferment(void)
/**
* read_persistent_clock64 - Return time from the persistent clock.
+ * @ts: Pointer to the storage for the readout value
*
* Weak dummy function for arches that do not yet support it.
* Reads the time from the battery backed persistent clock.
@@ -1490,10 +1590,11 @@ void __weak read_persistent_clock64(struct timespec64 *ts)
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
+ * @wall_time: current time as returned by persistent clock
+ * @boot_offset: offset that is defined as wall_time - boot_time
*
* Weak dummy function for arches that do not yet support it.
- * wall_time - current time as returned by persistent clock
- * boot_offset - offset that is defined as wall_time - boot_time
+ *
* The default function calculates offset based on the current value of
* local_clock(). This way architectures that support sched_clock() but don't
* support dedicated boot time clock will provide the best estimate of the
@@ -1578,7 +1679,8 @@ static struct timespec64 timekeeping_suspend_time;
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
- * @delta: pointer to a timespec delta value
+ * @tk: Pointer to the timekeeper to be updated
+ * @delta: Pointer to the delta value in timespec64 format
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables.
@@ -1599,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
-/**
+/*
* We have three kinds of time sources to use for sleep time
* injection, the preference order is:
* 1) non-stop clocksource
@@ -1620,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void)
return !suspend_timing_needed;
}
-/**
+/*
* 1) can be determined whether to use or not only when doing
* timekeeping_resume() which is invoked after rtc_suspend(),
* so we can't skip rtc_suspend() surely if system has 1).
@@ -1664,8 +1766,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif
@@ -1728,8 +1830,10 @@ void timekeeping_resume(void)
touch_softlockup_watchdog();
+ /* Resume the clockevent device(s) and hrtimers */
tick_resume();
- hrtimers_resume();
+ /* Notify timerfd as resume is equivalent to clock_was_set() */
+ timerfd_resume();
}
int timekeeping_suspend(void)
@@ -1877,7 +1981,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us:
* xtime_nsec_2 = xtime_nsec_1 - offset
- * Which simplfies to:
+ * Which simplifies to:
* xtime_nsec -= offset
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
@@ -1949,13 +2053,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
}
-/**
+/*
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
* Helper function that accumulates the nsecs greater than a second
* from the xtime_nsec field to the xtime_secs field.
* It also calls into the NTP code to handle leapsecond processing.
- *
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
@@ -1997,11 +2100,11 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
return clock_set;
}
-/**
+/*
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
- * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * a shifted interval nanoseconds. Allows for O(log) accumulation
* loop.
*
* Returns the unconsumed cycles.
@@ -2044,7 +2147,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static void timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
@@ -2059,19 +2162,12 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
if (unlikely(timekeeping_suspended))
goto out;
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
- offset = real_tk->cycle_interval;
-
- if (mode != TK_ADV_TICK)
- goto out;
-#else
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
goto out;
-#endif
/* Do some additional sanity checking */
timekeeping_check_update(tk, offset);
@@ -2122,9 +2218,8 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- if (clock_set)
- /* Have to call _delayed version, since in irq context*/
- clock_was_set_delayed();
+
+ return !!clock_set;
}
/**
@@ -2133,7 +2228,8 @@ out:
*/
void update_wall_time(void)
{
- timekeeping_advance(TK_ADV_TICK);
+ if (timekeeping_advance(TK_ADV_TICK))
+ clock_was_set_delayed();
}
/**
@@ -2193,7 +2289,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64);
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- calc_global_load(ticks);
+ calc_global_load();
}
/**
@@ -2240,7 +2336,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
return base;
}
-/**
+/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
@@ -2273,7 +2369,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
/*
* Validate if a timespec/timeval used to inject a time
- * offset is valid. Offsets can be postive or negative, so
+ * offset is valid. Offsets can be positive or negative, so
* we don't check tv_sec. The value of the timeval/timespec
* is the sum of its fields,but *NOTE*:
* The field tv_usec/tv_nsec must always be non-negative and
@@ -2305,6 +2401,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return 0;
}
+/**
+ * random_get_entropy_fallback - Returns the raw clock source value,
+ * used by random.c for platforms with no valid random_get_entropy().
+ */
+unsigned long random_get_entropy_fallback(void)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (unlikely(timekeeping_suspended || !clock))
+ return 0;
+ return clock->read(clock);
+}
+EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
@@ -2313,8 +2423,9 @@ int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct audit_ntp_data ad;
- unsigned long flags;
+ bool clock_set = false;
struct timespec64 ts;
+ unsigned long flags;
s32 orig_tai, tai;
int ret;
@@ -2322,6 +2433,7 @@ int do_adjtimex(struct __kernel_timex *txc)
ret = timekeeping_validate_timex(txc);
if (ret)
return ret;
+ add_device_randomness(txc, sizeof(*txc));
if (txc->modes & ADJ_SETOFFSET) {
struct timespec64 delta;
@@ -2339,6 +2451,7 @@ int do_adjtimex(struct __kernel_timex *txc)
audit_ntp_init(&ad);
ktime_get_real_ts64(&ts);
+ add_device_randomness(&ts, sizeof(ts));
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
@@ -2349,6 +2462,7 @@ int do_adjtimex(struct __kernel_timex *txc)
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ clock_set = true;
}
tk_update_leap_state(tk);
@@ -2359,10 +2473,10 @@ int do_adjtimex(struct __kernel_timex *txc)
/* Update the multiplier immediately if frequency was set directly */
if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- timekeeping_advance(TK_ADV_FREQ);
+ clock_set |= timekeeping_advance(TK_ADV_FREQ);
- if (tai != orig_tai)
- clock_was_set();
+ if (clock_set)
+ clock_was_set(CLOCK_REALTIME);
ntp_notify_cmos_timer();
@@ -2387,19 +2501,3 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
-
-/**
- * xtime_update() - advances the timekeeping infrastructure
- * @ticks: number of ticks, that have elapsed since the last call.
- *
- * Must be called with interrupts disabled.
- */
-void xtime_update(unsigned long ticks)
-{
- raw_spin_lock(&jiffies_lock);
- write_seqcount_begin(&jiffies_seq);
- do_timer(ticks);
- write_seqcount_end(&jiffies_seq);
- raw_spin_unlock(&jiffies_lock);
- update_wall_time();
-}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 099737f6f10c..543beba096c7 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -22,11 +22,12 @@ static inline int sched_clock_suspend(void) { return 0; }
static inline void sched_clock_resume(void) { }
#endif
+extern void update_process_times(int user);
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
extern raw_spinlock_t jiffies_lock;
-extern seqcount_t jiffies_seq;
+extern seqcount_raw_spinlock_t jiffies_seq;
#define CS_NAME_LEN 32
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index bcbb52db2256..4ca2787d1642 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TIMEKEEPING_INTERNAL_H
#define _TIMEKEEPING_INTERNAL_H
-/*
- * timekeeping debug functions
- */
+
#include <linux/clocksource.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
+/*
+ * timekeeping debug functions
+ */
#ifdef CONFIG_DEBUG_FS
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
#else
@@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
+/* Semi public for serialization of non timekeeper VDSO updates. */
+extern raw_spinlock_t timekeeper_lock;
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 026ac01af9da..352b161113cd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
+#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -157,7 +158,8 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
*/
#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
@@ -204,8 +206,9 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
- bool must_forward_clk;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -221,7 +224,7 @@ static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);
#ifdef CONFIG_SMP
-unsigned int sysctl_timer_migration = 1;
+static unsigned int sysctl_timer_migration = 1;
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
@@ -232,7 +235,42 @@ static void timers_update_migration(void)
else
static_branch_disable(&timers_migration_enabled);
}
-#else
+
+#ifdef CONFIG_SYSCTL
+static int timer_migration_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&timer_keys_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
+ return ret;
+}
+
+static struct ctl_table timer_sysctl[] = {
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init timer_sysctl_init(void)
+{
+ register_sysctl("kernel", timer_sysctl);
+ return 0;
+}
+device_initcall(timer_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+#else /* CONFIG_SMP */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */
@@ -249,19 +287,6 @@ void timers_update_nohz(void)
schedule_work(&timer_update_work);
}
-int timer_migration_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret;
-
- mutex_lock(&timer_keys_mutex);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!ret && write)
- timers_update_migration();
- mutex_unlock(&timer_keys_mutex);
- return ret;
-}
-
static inline bool is_timers_nohz_active(void)
{
return static_branch_unlikely(&timers_nohz_active);
@@ -488,35 +513,48 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
* Helper function to calculate the array index for a given expiry
* time.
*/
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+ unsigned long *bucket_expiry)
{
- expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+
+ /*
+ * The timer wheel has to guarantee that a timer does not fire
+ * early. Early expiry can happen due to:
+ * - Timer is armed at the edge of a tick
+ * - Truncation of the expiry time in the outer wheel levels
+ *
+ * Round up with level granularity to prevent this.
+ */
+ expires = (expires >> LVL_SHIFT(lvl)) + 1;
+ *bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+ unsigned long *bucket_expiry)
{
unsigned long delta = expires - clk;
unsigned int idx;
if (delta < LVL_START(1)) {
- idx = calc_index(expires, 0);
+ idx = calc_index(expires, 0, bucket_expiry);
} else if (delta < LVL_START(2)) {
- idx = calc_index(expires, 1);
+ idx = calc_index(expires, 1, bucket_expiry);
} else if (delta < LVL_START(3)) {
- idx = calc_index(expires, 2);
+ idx = calc_index(expires, 2, bucket_expiry);
} else if (delta < LVL_START(4)) {
- idx = calc_index(expires, 3);
+ idx = calc_index(expires, 3, bucket_expiry);
} else if (delta < LVL_START(5)) {
- idx = calc_index(expires, 4);
+ idx = calc_index(expires, 4, bucket_expiry);
} else if (delta < LVL_START(6)) {
- idx = calc_index(expires, 5);
+ idx = calc_index(expires, 5, bucket_expiry);
} else if (delta < LVL_START(7)) {
- idx = calc_index(expires, 6);
+ idx = calc_index(expires, 6, bucket_expiry);
} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
- idx = calc_index(expires, 7);
+ idx = calc_index(expires, 7, bucket_expiry);
} else if ((long) delta < 0) {
idx = clk & LVL_MASK;
+ *bucket_expiry = clk;
} else {
/*
* Force expire obscene large timeouts to expire at the
@@ -525,92 +563,111 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
if (delta >= WHEEL_TIMEOUT_CUTOFF)
expires = clk + WHEEL_TIMEOUT_MAX;
- idx = calc_index(expires, LVL_DEPTH - 1);
+ idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}
return idx;
}
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
- unsigned int idx;
-
- idx = calc_wheel_index(timer->expires, base->clk);
- enqueue_timer(base, timer, idx);
-}
-
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!is_timers_nohz_active())
- return;
-
/*
- * TODO: This wants some optimizing similar to the code below, but we
- * will do that when we switch from push to pull for deferrable timers.
+ * Deferrable timers do not prevent the CPU from entering dynticks and
+ * are not taken into account on the idle/nohz_full path. An IPI when a
+ * new deferrable timer is enqueued will wake up the remote CPU but
+ * nothing will be done with the deferrable timer base. Therefore skip
+ * the remote IPI for deferrable timers completely.
*/
- if (timer->flags & TIMER_DEFERRABLE) {
- if (tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
return;
- }
/*
* We might have to IPI the remote CPU if the base is idle and the
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
- if (!base->is_idle)
- return;
+ if (base->is_idle)
+ wake_up_nohz_cpu(base->cpu);
+}
- /* Check whether this is the new first expiring timer: */
- if (time_after_eq(timer->expires, base->next_expiry))
- return;
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx, unsigned long bucket_expiry)
+{
+
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, bucket_expiry);
/*
- * Set the next expiry time and kick the CPU so it can reevaluate the
- * wheel:
+ * Check whether this is the new first expiring timer. The
+ * effective expiry time of the timer is required here
+ * (bucket_expiry) instead of timer->expires.
*/
- if (time_before(timer->expires, base->clk)) {
+ if (time_before(bucket_expiry, base->next_expiry)) {
/*
- * Prevent from forward_timer_base() moving the base->clk
- * backward
+ * Set the next expiry time and kick the CPU so it
+ * can reevaluate the wheel:
*/
- base->next_expiry = base->clk;
- } else {
- base->next_expiry = timer->expires;
+ base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
+ base->next_expiry_recalc = false;
+ trigger_dyntick_cpu(base, timer);
}
- wake_up_nohz_cpu(base->cpu);
}
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
- __internal_add_timer(base, timer);
- trigger_dyntick_cpu(base, timer);
+ unsigned long bucket_expiry;
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+ enqueue_timer(base, timer, idx, bucket_expiry);
}
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-static struct debug_obj_descr timer_debug_descr;
+static const struct debug_obj_descr timer_debug_descr;
+
+struct timer_hint {
+ void (*function)(struct timer_list *t);
+ long offset;
+};
+
+#define TIMER_HINT(fn, container, timr, hintfn) \
+ { \
+ .function = fn, \
+ .offset = offsetof(container, hintfn) - \
+ offsetof(container, timr) \
+ }
+
+static const struct timer_hint timer_hints[] = {
+ TIMER_HINT(delayed_work_timer_fn,
+ struct delayed_work, timer, work.func),
+ TIMER_HINT(kthread_delayed_work_timer_fn,
+ struct kthread_delayed_work, timer, work.func),
+};
static void *timer_debug_hint(void *addr)
{
- return ((struct timer_list *) addr)->function;
+ struct timer_list *timer = addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
+ if (timer_hints[i].function == timer->function) {
+ void (**fn)(void) = addr + timer_hints[i].offset;
+
+ return *fn;
+ }
+ }
+
+ return timer->function;
}
static bool timer_is_static_object(void *addr)
@@ -661,7 +718,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
- /* fall through */
+ fallthrough;
default:
return false;
}
@@ -702,7 +759,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr timer_debug_descr = {
+static const struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
.is_static_object = timer_is_static_object,
@@ -727,11 +784,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer)
debug_object_deactivate(timer, &timer_debug_descr);
}
-static inline void debug_timer_free(struct timer_list *timer)
-{
- debug_object_free(timer, &timer_debug_descr);
-}
-
static inline void debug_timer_assert_init(struct timer_list *timer)
{
debug_object_assert_init(timer, &timer_debug_descr);
@@ -789,6 +841,8 @@ static void do_init_timer(struct timer_list *timer,
{
timer->entry.pprev = NULL;
timer->function = func;
+ if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
+ flags &= TIMER_INIT_FLAGS;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -834,8 +888,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
@@ -883,38 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
return get_timer_this_cpu_base(tflags);
}
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+ unsigned long basej)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned long jnow;
-
/*
- * We only forward the base when we are idle or have just come out of
- * idle (must_forward_clk logic), and have a delta between base clock
- * and jiffies. In the common case, run_timers will take care of it.
+ * Check whether we can forward the base. We can only do that when
+ * @basej is past base->clk otherwise we might rewind base->clk.
*/
- if (likely(!base->must_forward_clk))
- return;
-
- jnow = READ_ONCE(jiffies);
- base->must_forward_clk = base->is_idle;
- if ((long)(jnow - base->clk) < 2)
+ if (time_before_eq(basej, base->clk))
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow)) {
- base->clk = jnow;
+ if (time_after(base->next_expiry, basej)) {
+ base->clk = basej;
} else {
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
return;
base->clk = base->next_expiry;
}
-#endif
+
}
+static inline void forward_timer_base(struct timer_base *base)
+{
+ __forward_timer_base(base, READ_ONCE(jiffies));
+}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -960,12 +1012,12 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
+ unsigned long clk = 0, flags, bucket_expiry;
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
- unsigned long clk = 0, flags;
int ret = 0;
- BUG_ON(!timer->function);
+ debug_assert_init(timer);
/*
* This is a common optimization triggered by the networking code - if
@@ -992,6 +1044,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
@@ -1001,7 +1061,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
clk = base->clk;
- idx = calc_wheel_index(expires, clk);
+ idx = calc_wheel_index(expires, clk, &bucket_expiry);
/*
* Retrieve and compare the array index of the pending
@@ -1018,6 +1078,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
} else {
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
}
@@ -1031,7 +1099,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
+ * otherwise timer_delete_sync() can't detect that the timer's
* handler yet has not finished. This also guarantees that the
* timer is serialized wrt itself.
*/
@@ -1054,16 +1122,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* If 'idx' was calculated above and the base time did not advance
* between calculating 'idx' and possibly switching the base, only
- * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
- * we need to (re)calculate the wheel index via
- * internal_add_timer().
+ * enqueue_timer() is required. Otherwise we need to (re)calculate
+ * the wheel index via internal_add_timer().
*/
- if (idx != UINT_MAX && clk == base->clk) {
- enqueue_timer(base, timer, idx);
- trigger_dyntick_cpu(base, timer);
- } else {
+ if (idx != UINT_MAX && clk == base->clk)
+ enqueue_timer(base, timer, idx, bucket_expiry);
+ else
internal_add_timer(base, timer);
- }
out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1072,14 +1137,20 @@ out_unlock:
}
/**
- * mod_timer_pending - modify a pending timer's timeout
- * @timer: the pending timer to be modified
- * @expires: new timeout in jiffies
+ * mod_timer_pending - Modify a pending timer's timeout
+ * @timer: The pending timer to be modified
+ * @expires: New absolute timeout in jiffies
+ *
+ * mod_timer_pending() is the same for pending timers as mod_timer(), but
+ * will not activate inactive timers.
*
- * mod_timer_pending() is the same for pending timers as mod_timer(),
- * but will not re-activate and modify already deleted timers.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
*
- * It is useful for unserialized use of timers.
+ * Return:
+ * * %0 - The timer was inactive and not modified or was in
+ * shutdown state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
@@ -1088,24 +1159,31 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer_pending);
/**
- * mod_timer - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer() is a more efficient way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
+ * mod_timer - Modify a timer's timeout
+ * @timer: The timer to be modified
+ * @expires: New absolute timeout in jiffies
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
+ * mod_timer() is more efficient than the above open coded sequence. In
+ * case that the timer is inactive, the del_timer() part is a NOP. The
+ * timer is in any case activated with the new expiry time @expires.
+ *
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
- * The function returns whether it has modified a pending timer or not.
- * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
- * active timer returns 1.)
+ * If @timer->function == NULL then the start operation is silently
+ * discarded. In this case the return value is 0 and meaningless.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires did
+ * not change the effective expiry time
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
@@ -1116,11 +1194,22 @@ EXPORT_SYMBOL(mod_timer);
/**
* timer_reduce - Modify a timer's timeout if it would reduce the timeout
* @timer: The timer to be modified
- * @expires: New timeout in jiffies
+ * @expires: New absolute timeout in jiffies
*
* timer_reduce() is very similar to mod_timer(), except that it will only
- * modify a running timer if that would reduce the expiration time (it will
- * start a timer that isn't running).
+ * modify an enqueued timer if that would reduce the expiration time. If
+ * @timer is not enqueued it starts the timer.
+ *
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires
+ * did not change the effective expiry time such that the
+ * timer would expire earlier than already scheduled
*/
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
@@ -1129,39 +1218,51 @@ int timer_reduce(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(timer_reduce);
/**
- * add_timer - start a timer
- * @timer: the timer to be added
+ * add_timer - Start a timer
+ * @timer: The timer to be started
*
- * The kernel will do a ->function(@timer) callback from the
- * timer interrupt at the ->expires point in the future. The
- * current time is 'jiffies'.
+ * Start @timer to expire at @timer->expires in the future. @timer->expires
+ * is the absolute expiry time measured in 'jiffies'. When the timer expires
+ * timer->function(timer) will be invoked from soft interrupt context.
*
- * The timer's ->expires, ->function fields must be set prior calling this
- * function.
+ * The @timer->expires and @timer->function fields must be set prior
+ * to calling this function.
*
- * Timers with an ->expires field in the past will be executed in the next
- * timer tick.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * If @timer->expires is already in the past @timer will be queued to
+ * expire at the next timer tick.
+ *
+ * This can only operate on an inactive timer. Attempts to invoke this on
+ * an active timer are rejected with a warning.
*/
void add_timer(struct timer_list *timer)
{
- BUG_ON(timer_pending(timer));
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
/**
- * add_timer_on - start a timer on a particular CPU
- * @timer: the timer to be added
- * @cpu: the CPU to start it on
+ * add_timer_on - Start a timer on a particular CPU
+ * @timer: The timer to be started
+ * @cpu: The CPU to start it on
*
- * This is not very scalable on SMP. Double adds are not possible.
+ * Same as add_timer() except that it starts the timer on the given CPU.
+ *
+ * See add_timer() for further details.
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
struct timer_base *new_base, *base;
unsigned long flags;
- BUG_ON(timer_pending(timer) || !timer->function);
+ debug_assert_init(timer);
+
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1171,6 +1272,13 @@ void add_timer_on(struct timer_list *timer, int cpu)
* wrong base locked. See lock_timer_base().
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated while
+ * holding base lock to prevent a race against the shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
if (base != new_base) {
timer->flags |= TIMER_MIGRATING;
@@ -1184,22 +1292,27 @@ void add_timer_on(struct timer_list *timer, int cpu)
debug_timer_activate(timer);
internal_add_timer(base, timer);
+out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
/**
- * del_timer - deactivate a timer.
- * @timer: the timer to be deactivated
- *
- * del_timer() deactivates a timer - this works on both active and inactive
- * timers.
- *
- * The function returns whether it has deactivated a pending timer or not.
- * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
- * active timer returns 1.)
+ * __timer_delete - Internal function: Deactivate a timer
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shutdown permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the time. In that
+ * case any attempt to rearm @timer after this function returns will be
+ * silently ignored.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer(struct timer_list *timer)
+static int __timer_delete(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1207,24 +1320,90 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- if (timer_pending(timer)) {
+ /*
+ * If @shutdown is set then the lock has to be taken whether the
+ * timer is pending or not to protect against a concurrent rearm
+ * which might hit between the lockless pending check and the lock
+ * aquisition. By taking the lock it is ensured that such a newly
+ * enqueued timer is dequeued and cannot end up with
+ * timer->function == NULL in the expiry code.
+ *
+ * If timer->function is currently executed, then this makes sure
+ * that the callback cannot requeue the timer.
+ */
+ if (timer_pending(timer) || shutdown) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
}
return ret;
}
-EXPORT_SYMBOL(del_timer);
/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
+ * timer_delete - Deactivate a timer
+ * @timer: The timer to be deactivated
+ *
+ * The function only deactivates a pending timer, but contrary to
+ * timer_delete_sync() it does not take into account whether the timer's
+ * callback function is concurrently executed on a different CPU or not.
+ * It neither prevents rearming of the timer. If @timer can be rearmed
+ * concurrently then the return value of this function is meaningless.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete(struct timer_list *timer)
+{
+ return __timer_delete(timer, false);
+}
+EXPORT_SYMBOL(timer_delete);
+
+/**
+ * timer_shutdown - Deactivate a timer and prevent rearming
+ * @timer: The timer to be deactivated
+ *
+ * The function does not wait for an eventually running timer callback on a
+ * different CPU but it prevents rearming of the timer. Any attempt to arm
+ * @timer after this function returns will be silently ignored.
+ *
+ * This function is useful for teardown code and should only be used when
+ * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
*
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_shutdown(struct timer_list *timer)
+{
+ return __timer_delete(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown);
+
+/**
+ * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
+ * @timer: Timer to deactivate
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shutdown permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the timer. Any
+ * attempt to rearm @timer after this function returns will be silently
+ * ignored.
+ *
+ * This function cannot guarantee that the timer cannot be rearmed
+ * right after dropping the base lock if @shutdown is false. That
+ * needs to be prevented by the calling code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1236,11 +1415,34 @@ int try_to_del_timer_sync(struct timer_list *timer)
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: Timer to deactivate
+ *
+ * This function tries to deactivate a timer. On success the timer is not
+ * queued and the timer callback function is not running on any CPU.
+ *
+ * This function does not guarantee that the timer cannot be rearmed right
+ * after dropping the base lock. That needs to be prevented by the calling
+ * code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ return __try_to_del_timer_sync(timer, false);
+}
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_PREEMPT_RT
@@ -1263,14 +1465,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* The counterpart to del_timer_wait_running().
*
* If there is a waiter for base->expiry_lock, then it was waiting for the
- * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1289,7 +1493,7 @@ static void del_timer_wait_running(struct timer_list *timer)
u32 tf;
tf = READ_ONCE(timer->flags);
- if (!(tf & TIMER_MIGRATING)) {
+ if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
struct timer_base *base = get_timer_base(tf);
/*
@@ -1314,44 +1518,29 @@ static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
- * del_timer_sync - deactivate a timer and wait for the handler to finish.
- * @timer: the timer to be deactivated
- *
- * This function only differs from del_timer() on SMP: besides deactivating
- * the timer it also makes sure the handler has finished executing on other
- * CPUs.
- *
- * Synchronization rules: Callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts unless the timer is an irqsafe one. The caller must
- * not hold locks which would prevent completion of the timer's
- * handler. The timer's handler must not call add_timer_on(). Upon exit the
- * timer is not queued and the handler is not running on any CPU.
- *
- * Note: For !irqsafe timers, you must not hold locks that are held in
- * interrupt context while calling this function. Even if the lock has
- * nothing to do with the timer in question. Here's why::
- *
- * CPU0 CPU1
- * ---- ----
- * <SOFTIRQ>
- * call_timer_fn();
- * base->running_timer = mytimer;
- * spin_lock_irq(somelock);
- * <IRQ>
- * spin_lock(somelock);
- * del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
- *
- * Now del_timer_sync() will never return and never release somelock.
- * The interrupt on the other CPU is waiting to grab somelock but
- * it has interrupted the softirq that CPU0 is waiting to finish.
- *
- * The function returns whether it has deactivated a pending timer or not.
+ * __timer_delete_sync - Internal function: Deactivate a timer and wait
+ * for the handler to finish.
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, @timer->function will be set to NULL under the
+ * timer base lock which prevents rearming of @timer
+ *
+ * If @shutdown is not set the timer can be rearmed later. If the timer can
+ * be rearmed concurrently, i.e. after dropping the base lock then the
+ * return value is meaningless.
+ *
+ * If @shutdown is set then @timer->function is set to NULL under timer
+ * base lock which prevents rearming of the timer. Any attempt to rearm
+ * a shutdown timer is silently ignored.
+ *
+ * If the timer should be reused after shutdown it has to be initialized
+ * again.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer_sync(struct timer_list *timer)
+static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
int ret;
@@ -1371,10 +1560,17 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
+ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
+
+ /*
+ * Must be able to sleep on PREEMPT_RT because of the slowpath in
+ * del_timer_wait_running().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
+ lockdep_assert_preemption_enabled();
do {
- ret = try_to_del_timer_sync(timer);
+ ret = __try_to_del_timer_sync(timer, shutdown);
if (unlikely(ret < 0)) {
del_timer_wait_running(timer);
@@ -1384,8 +1580,96 @@ int del_timer_sync(struct timer_list *timer)
return ret;
}
-EXPORT_SYMBOL(del_timer_sync);
-#endif
+
+/**
+ * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
+ * @timer: The timer to be deactivated
+ *
+ * Synchronization rules: Callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's callback
+ * function. The timer's handler must not call add_timer_on(). Upon exit
+ * the timer is not queued and the handler is not running on any CPU.
+ *
+ * For !irqsafe timers, the caller must not hold locks that are held in
+ * interrupt context. Even if the lock has nothing to do with the timer in
+ * question. Here's why::
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * timer_delete_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now timer_delete_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but it has
+ * interrupted the softirq that CPU0 is waiting to finish.
+ *
+ * This function cannot guarantee that the timer is not rearmed again by
+ * some concurrent or preempting code, right after it dropped the base
+ * lock. If there is the possibility of a concurrent rearm then the return
+ * value of the function is meaningless.
+ *
+ * If such a guarantee is needed, e.g. for teardown situations then use
+ * timer_shutdown_sync() instead.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete_sync(struct timer_list *timer)
+{
+ return __timer_delete_sync(timer, false);
+}
+EXPORT_SYMBOL(timer_delete_sync);
+
+/**
+ * timer_shutdown_sync - Shutdown a timer and prevent rearming
+ * @timer: The timer to be shutdown
+ *
+ * When the function returns it is guaranteed that:
+ * - @timer is not queued
+ * - The callback function of @timer is not running
+ * - @timer cannot be enqueued again. Any attempt to rearm
+ * @timer is silently ignored.
+ *
+ * See timer_delete_sync() for synchronization rules.
+ *
+ * This function is useful for final teardown of an infrastructure where
+ * the timer is subject to a circular dependency problem.
+ *
+ * A common pattern for this is a timer and a workqueue where the timer can
+ * schedule work and work can arm the timer. On shutdown the workqueue must
+ * be destroyed and the timer must be prevented from rearming. Unless the
+ * code has conditionals like 'if (mything->in_shutdown)' to prevent that
+ * there is no way to get this correct with timer_delete_sync().
+ *
+ * timer_shutdown_sync() is solving the problem. The correct ordering of
+ * calls in this case is:
+ *
+ * timer_shutdown_sync(&mything->timer);
+ * workqueue_destroy(&mything->workqueue);
+ *
+ * After this 'mything' can be safely freed.
+ *
+ * This obviously implies that the timer is not required to be functional
+ * for the rest of the shutdown operation.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
+ */
+int timer_shutdown_sync(struct timer_list *timer)
+{
+ return __timer_delete_sync(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown_sync);
static void call_timer_fn(struct timer_list *timer,
void (*fn)(struct timer_list *),
@@ -1407,8 +1691,8 @@ static void call_timer_fn(struct timer_list *timer,
#endif
/*
* Couple the lock chain with the lock chain at
- * del_timer_sync() by acquiring the lock_map around the fn()
- * call here and in del_timer_sync().
+ * timer_delete_sync() by acquiring the lock_map around the fn()
+ * call here and in timer_delete_sync().
*/
lock_map_acquire(&lockdep_map);
@@ -1451,25 +1735,31 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
fn = timer->function;
+ if (WARN_ON_ONCE(!fn)) {
+ /* Should never happen. Emphasis on should! */
+ base->running_timer = NULL;
+ continue;
+ }
+
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
-static int __collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- unsigned long clk = base->clk;
+ unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
@@ -1491,7 +1781,6 @@ static int __collect_expired_timers(struct timer_base *base,
return levels;
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
@@ -1514,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
/*
* Search the first expiring timer in the various clock levels. Caller must
* hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
*/
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
@@ -1524,6 +1815,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
@@ -1531,6 +1823,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
+
+ /*
+ * If the next expiration happens before we reach
+ * the next level, no need to check further.
+ */
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+ break;
}
/*
* Clock for the next level. If the current level clock lower
@@ -1568,13 +1867,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
- adj = clk & LVL_CLK_MASK ? 1 : 0;
+ adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
- return next;
+
+ base->next_expiry = next;
+ base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
@@ -1619,9 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
u64 expires = KTIME_MAX;
- unsigned long nextevt;
- bool is_max_delta;
+ bool was_idle;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1631,39 +1934,45 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+ if (base->next_expiry_recalc)
+ next_expiry_recalc(base);
+
/*
* We have a fresh next event. Check whether we can forward the
- * base. We can only do that when @basej is past base->clk
- * otherwise we might rewind base->clk.
+ * base.
*/
- if (time_after(basej, base->clk)) {
- if (time_after(nextevt, basej))
- base->clk = basej;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
- }
+ __forward_timer_base(base, basej);
- if (time_before_eq(nextevt, basej)) {
- expires = basem;
- base->is_idle = false;
+ if (base->timers_pending) {
+ nextevt = base->next_expiry;
+
+ /* If we missed a tick already, force 0 delta */
+ if (time_before(nextevt, basej))
+ nextevt = basej;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
} else {
- if (!is_max_delta)
- expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle.
- * Also the tick is stopped so any added timer must forward
- * the base clk itself to keep granularity small. This idle
- * logic is only maintained for the BASE_STD base, deferrable
- * timers may still see large granularity skew (by design).
+ * Move next_expiry for the empty base into the future to
+ * prevent a unnecessary raise of the timer softirq when the
+ * next_expiry value will be reached even if there is no timer
+ * pending.
*/
- if ((expires - basem) > TICK_NSEC) {
- base->must_forward_clk = true;
- base->is_idle = true;
- }
+ base->next_expiry = nextevt;
}
+
+ /*
+ * Base is idle if the next event is more than a tick away.
+ *
+ * If the base is marked idle then any timer add operation must forward
+ * the base clk itself to keep granularity small. This idle logic is
+ * only maintained for the BASE_STD base, deferrable timers may still
+ * see large granularity skew (by design).
+ */
+ was_idle = base->is_idle;
+ base->is_idle = time_after(nextevt, basej + 1);
+ if (was_idle != base->is_idle)
+ trace_timer_base_idle(base->is_idle, base->cpu);
+
raw_spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
@@ -1684,73 +1993,12 @@ void timer_clear_idle(void)
* sending the IPI a few instructions smaller for the cost of taking
* the lock in the exit from idle path.
*/
- base->is_idle = false;
-}
-
-static int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- /*
- * NOHZ optimization. After a long idle sleep we need to forward the
- * base to current jiffies. Avoid a loop by searching the bitfield for
- * the next expiring timer.
- */
- if ((long)(now - base->clk) > 2) {
- unsigned long next = __next_timer_interrupt(base);
-
- /*
- * If the next timer is ahead of time forward to current
- * jiffies, otherwise forward to the next expiry time:
- */
- if (time_after(next, now)) {
- /*
- * The call site will increment base->clk and then
- * terminate the expiry loop immediately.
- */
- base->clk = now;
- return 0;
- }
- base->clk = next;
+ if (base->is_idle) {
+ base->is_idle = false;
+ trace_timer_base_idle(false, smp_processor_id());
}
- return __collect_expired_timers(base, heads);
}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- return __collect_expired_timers(base, heads);
-}
-#endif
-
-/*
- * Called from the timer interrupt handler to charge one tick to the current
- * process. user_tick is 1 if the tick is user time, 0 for system.
- */
-void update_process_times(int user_tick)
-{
- struct task_struct *p = current;
-
- /* Note: this timer irq context must be accounted for as well. */
- account_process_tick(p, user_tick);
- run_local_timers();
- rcu_sched_clock_irq(user_tick);
-#ifdef CONFIG_IRQ_WORK
- if (in_irq())
- irq_work_tick();
#endif
- scheduler_tick();
- if (IS_ENABLED(CONFIG_POSIX_TIMERS))
- run_posix_cpu_timers();
-
- /* The current CPU might make use of net randoms without receiving IRQs
- * to renew them often enough. Let's update the net_rand_state from a
- * non-constant value that's not affine to the number of calls to make
- * sure it's updated when there's some activity (we don't care in idle).
- */
- this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick);
-}
/**
* __run_timers - run all expired timers (if any) on this CPU.
@@ -1761,32 +2009,30 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (!time_after_eq(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
- /*
- * timer_base::must_forward_clk must be cleared before running
- * timers so that any timer functions that call mod_timer() will
- * not try to forward the base. Idle tracking / clock forwarding
- * logic is only used with BASE_STD timers.
- *
- * The must_forward_clk flag is cleared unconditionally also for
- * the deferrable base. The deferrable base is not affected by idle
- * tracking and never forwarded, so clearing the flag is a NOOP.
- *
- * The fact that the deferrable base is never forwarded can cause
- * large variations in granularity for deferrable timers, but they
- * can be deferred for long periods due to idle anyway.
- */
- base->must_forward_clk = false;
-
- while (time_after_eq(jiffies, base->clk)) {
-
+ while (time_after_eq(jiffies, base->clk) &&
+ time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The two possible reasons for not finding any expired
+ * timer at this clk are that all matching timers have been
+ * dequeued or no timer has been queued since
+ * base::next_expiry was set to base::clk +
+ * NEXT_TIMER_MAX_DELTA.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+ && base->timers_pending);
+ /*
+ * While executing timers, base->clk is set 1 offset ahead of
+ * jiffies to avoid endless requeuing to current jiffies.
+ */
base->clk++;
+ next_expiry_recalc(base);
while (levels--)
expire_timers(base, heads + levels);
@@ -1810,24 +2056,45 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
/*
* Called by the local, per-CPU timer interrupt on SMP.
*/
-void run_local_timers(void)
+static void run_local_timers(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk)) {
+ if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
}
/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ rcu_sched_clock_irq(user_tick);
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_tick();
+#endif
+ scheduler_tick();
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ run_posix_cpu_timers();
+}
+
+/*
* Since schedule_timeout()'s timer is defined on the stack, it must store
* the target task on the stack as well.
*/
@@ -1903,7 +2170,7 @@ signed long __sched schedule_timeout(signed long timeout)
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
goto out;
}
}
@@ -1914,7 +2181,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_setup_on_stack(&timer.timer, process_timeout, 0);
__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
- del_singleshot_timer_sync(&timer.timer);
+ del_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
@@ -1985,8 +2252,9 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry_recalc = false;
+ base->timers_pending = false;
base->is_idle = false;
- base->must_forward_clk = true;
}
return 0;
}
@@ -1997,8 +2265,6 @@ int timers_dead_cpu(unsigned int cpu)
struct timer_base *new_base;
int b, i;
- BUG_ON(cpu_online(cpu));
-
for (b = 0; b < NR_BASES; b++) {
old_base = per_cpu_ptr(&timer_bases[b], cpu);
new_base = get_cpu_ptr(&timer_bases[b]);
@@ -2015,7 +2281,8 @@ int timers_dead_cpu(unsigned int cpu)
*/
forward_timer_base(new_base);
- BUG_ON(old_base->running_timer);
+ WARN_ON_ONCE(old_base->running_timer);
+ old_base->running_timer = NULL;
for (i = 0; i < WHEEL_SIZE; i++)
migrate_timer_list(new_base, old_base->vectors + i);
@@ -2039,6 +2306,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2054,6 +2322,7 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
+ posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -2087,26 +2356,28 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
/**
- * usleep_range - Sleep for an approximate time
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ * @state: State of the current task that will be while sleeping
*
* In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range() instead of udelay(). The sleep improves responsiveness
+ * usleep_range_state() instead of udelay(). The sleep improves responsiveness
* by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
* power usage by allowing hrtimers to take advantage of an already-
* scheduled interrupt instead of scheduling a new one just for this sleep.
*/
-void __sched usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range_state(unsigned long min, unsigned long max,
+ unsigned int state)
{
ktime_t exp = ktime_add_us(ktime_get(), min);
u64 delta = (u64)(max - min) * NSEC_PER_USEC;
for (;;) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(state);
/* Do not return before the requested sleep time has elapsed */
if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
break;
}
}
-EXPORT_SYMBOL(usleep_range);
+EXPORT_SYMBOL(usleep_range_state);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index acb326f5f50a..ed7d6ad694fb 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -42,24 +42,11 @@ static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
va_end(args);
}
-static void print_name_offset(struct seq_file *m, void *sym)
-{
- char symname[KSYM_NAME_LEN];
-
- if (lookup_symbol_name((unsigned long)sym, symname) < 0)
- SEQ_printf(m, "<%pK>", sym);
- else
- SEQ_printf(m, "%s", symname);
-}
-
static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: ", idx);
- print_name_offset(m, taddr);
- SEQ_printf(m, ", ");
- print_name_offset(m, timer->function);
+ SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -116,9 +103,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
- SEQ_printf(m, " .get_time: ");
- print_name_offset(m, base->get_time);
- SEQ_printf(m, "\n");
+ SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
@@ -218,44 +203,39 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
SEQ_printf(m, " next_event: %Ld nsecs\n",
(unsigned long long) ktime_to_ns(dev->next_event));
- SEQ_printf(m, " set_next_event: ");
- print_name_offset(m, dev->set_next_event);
- SEQ_printf(m, "\n");
+ SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event);
- if (dev->set_state_shutdown) {
- SEQ_printf(m, " shutdown: ");
- print_name_offset(m, dev->set_state_shutdown);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_shutdown)
+ SEQ_printf(m, " shutdown: %ps\n",
+ dev->set_state_shutdown);
- if (dev->set_state_periodic) {
- SEQ_printf(m, " periodic: ");
- print_name_offset(m, dev->set_state_periodic);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_periodic)
+ SEQ_printf(m, " periodic: %ps\n",
+ dev->set_state_periodic);
- if (dev->set_state_oneshot) {
- SEQ_printf(m, " oneshot: ");
- print_name_offset(m, dev->set_state_oneshot);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot)
+ SEQ_printf(m, " oneshot: %ps\n",
+ dev->set_state_oneshot);
- if (dev->set_state_oneshot_stopped) {
- SEQ_printf(m, " oneshot stopped: ");
- print_name_offset(m, dev->set_state_oneshot_stopped);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot_stopped)
+ SEQ_printf(m, " oneshot stopped: %ps\n",
+ dev->set_state_oneshot_stopped);
- if (dev->tick_resume) {
- SEQ_printf(m, " resume: ");
- print_name_offset(m, dev->tick_resume);
- SEQ_printf(m, "\n");
- }
+ if (dev->tick_resume)
+ SEQ_printf(m, " resume: %ps\n",
+ dev->tick_resume);
- SEQ_printf(m, " event_handler: ");
- print_name_offset(m, dev->event_handler);
+ SEQ_printf(m, " event_handler: %ps\n", dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ if (cpu >= 0) {
+ const struct clock_event_device *wd = tick_get_wakeup_device(cpu);
+
+ SEQ_printf(m, "Wakeup Device: %s\n", wd ? wd->name : "<NULL>");
+ }
+#endif
SEQ_printf(m, "\n");
}
@@ -276,7 +256,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.8\n");
+ SEQ_printf(m, "Timer List Version: v0.9\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 54ce6eb2ca36..f0d5062d9cbc 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -13,6 +13,8 @@
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>
+#include "timekeeping_internal.h"
+
static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
@@ -106,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk)
/*
* If the current clocksource is not VDSO capable, then spare the
- * update of the high reolution parts.
+ * update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_data(vdata, tk);
@@ -127,3 +129,42 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_data(vdata);
}
+
+/**
+ * vdso_update_begin - Start of a VDSO update section
+ *
+ * Allows architecture code to safely update the architecture specific VDSO
+ * data. Disables interrupts, acquires timekeeper lock to serialize against
+ * concurrent updates from timekeeping and invalidates the VDSO data
+ * sequence counter to prevent concurrent readers from accessing
+ * inconsistent data.
+ *
+ * Returns: Saved interrupt flags which need to be handed in to
+ * vdso_update_end().
+ */
+unsigned long vdso_update_begin(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ vdso_write_begin(vdata);
+ return flags;
+}
+
+/**
+ * vdso_update_end - End of a VDSO update section
+ * @flags: Interrupt flags as returned from vdso_update_begin()
+ *
+ * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
+ * synchronization if the architecture requires it, drops timekeeper lock
+ * and restores interrupt flags.
+ */
+void vdso_update_end(unsigned long flags)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ vdso_write_end(vdata);
+ __arch_sync_vdso_data(vdata);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
diff --git a/kernel/torture.c b/kernel/torture.c
index a1a41484ff6d..c72ab2d251f4 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -37,6 +37,7 @@
#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include <linux/sched/rt.h>
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
@@ -45,6 +46,18 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static bool disable_onoff_at_boot;
module_param(disable_onoff_at_boot, bool, 0444);
+static bool ftrace_dump_at_shutdown;
+module_param(ftrace_dump_at_shutdown, bool, 0444);
+
+static int verbose_sleep_frequency;
+module_param(verbose_sleep_frequency, int, 0444);
+
+static int verbose_sleep_duration = 1;
+module_param(verbose_sleep_duration, int, 0444);
+
+static int random_shuffle;
+module_param(random_shuffle, int, 0444);
+
static char *torture_type;
static int verbose;
@@ -55,6 +68,96 @@ static int verbose;
static int fullstop = FULLSTOP_RMMOD;
static DEFINE_MUTEX(fullstop_mutex);
+static atomic_t verbose_sleep_counter;
+
+/*
+ * Sleep if needed from VERBOSE_TOROUT*().
+ */
+void verbose_torout_sleep(void)
+{
+ if (verbose_sleep_frequency > 0 &&
+ verbose_sleep_duration > 0 &&
+ !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency))
+ schedule_timeout_uninterruptible(verbose_sleep_duration);
+}
+EXPORT_SYMBOL_GPL(verbose_torout_sleep);
+
+/*
+ * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit
+ * nanosecond random fuzz. This function and its friends desynchronize
+ * testing from the timer wheel.
+ */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+ struct torture_random_state *trsp)
+{
+ ktime_t hto = baset_ns;
+
+ if (trsp)
+ hto += torture_random(trsp) % fuzzt_ns;
+ set_current_state(TASK_IDLE);
+ return schedule_hrtimeout(&hto, mode);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
+
+/*
+ * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit
+ * nanosecond (not microsecond!) random fuzz.
+ */
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_us * NSEC_PER_USEC;
+
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * microsecond (not millisecond!) random fuzz.
+ */
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_ms * NSEC_PER_MSEC;
+ u32 fuzzt_ns;
+
+ if ((u32)~0U / NSEC_PER_USEC < fuzzt_us)
+ fuzzt_ns = (u32)~0U;
+ else
+ fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
+
+/*
+ * Schedule a high-resolution-timer sleep in jiffies, with an
+ * implied one-jiffy random fuzz. This is intended to replace calls to
+ * schedule_timeout_interruptible() and friends.
+ */
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = jiffies_to_nsecs(baset_j);
+
+ return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * millisecond (not second!) random fuzz.
+ */
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_s * NSEC_PER_SEC;
+ u32 fuzzt_ns;
+
+ if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms)
+ fuzzt_ns = (u32)~0U;
+ else
+ fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
+
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -77,6 +180,19 @@ static unsigned long sum_online;
static int min_online = -1;
static int max_online;
+static int torture_online_cpus = NR_CPUS;
+
+/*
+ * Some torture testing leverages confusion as to the number of online
+ * CPUs. This function returns the torture-testing view of this number,
+ * which allows torture tests to load-balance appropriately.
+ */
+int torture_num_online_cpus(void)
+{
+ return READ_ONCE(torture_online_cpus);
+}
+EXPORT_SYMBOL_GPL(torture_num_online_cpus);
+
/*
* Attempt to take a CPU offline. Return false if the CPU is already
* offline or if it is not subject to CPU-hotplug operations. The
@@ -131,6 +247,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
*min_offl = delta;
if (*max_offl < delta)
*max_offl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1);
+ WARN_ON_ONCE(torture_online_cpus <= 0);
}
return true;
@@ -187,6 +305,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
*min_onl = delta;
if (*max_onl < delta)
*max_onl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1);
}
return true;
@@ -194,6 +313,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
EXPORT_SYMBOL_GPL(torture_online);
/*
+ * Get everything online at the beginning and ends of tests.
+ */
+static void torture_online_all(char *phase)
+{
+ int cpu;
+ int ret;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_online(cpu))
+ continue;
+ ret = add_cpu(cpu);
+ if (ret && verbose) {
+ pr_alert("%s" TORTURE_FLAG
+ "%s: %s online %d: errno %d\n",
+ __func__, phase, torture_type, cpu, ret);
+ }
+ }
+}
+
+/*
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
@@ -203,25 +342,12 @@ torture_onoff(void *arg)
int cpu;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
- int ret;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
- if (!IS_MODULE(CONFIG_TORTURE_TEST)) {
- for_each_possible_cpu(cpu) {
- if (cpu_online(cpu))
- continue;
- ret = add_cpu(cpu);
- if (ret && verbose) {
- pr_alert("%s" TORTURE_FLAG
- "%s: Initial online %d: errno %d\n",
- __func__, torture_type, cpu, ret);
- }
- }
- }
-
+ torture_online_all("Initial");
if (maxcpu == 0) {
VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
goto stop;
@@ -229,26 +355,27 @@ torture_onoff(void *arg)
if (onoff_holdoff > 0) {
VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
- schedule_timeout_interruptible(onoff_holdoff);
+ torture_hrtimeout_jiffies(onoff_holdoff, &rand);
VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
}
while (!torture_must_stop()) {
if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) {
- schedule_timeout_interruptible(HZ / 10);
+ torture_hrtimeout_jiffies(HZ / 10, &rand);
continue;
}
- cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ cpu = torture_random(&rand) % (maxcpu + 1);
if (!torture_offline(cpu,
&n_offline_attempts, &n_offline_successes,
&sum_offline, &min_offline, &max_offline))
torture_online(cpu,
&n_online_attempts, &n_online_successes,
&sum_online, &min_online, &max_online);
- schedule_timeout_interruptible(onoff_interval);
+ torture_hrtimeout_jiffies(onoff_interval, &rand);
}
stop:
torture_kthread_stopping("torture_onoff");
+ torture_online_all("Final");
return 0;
}
@@ -328,7 +455,7 @@ unsigned long
torture_random(struct torture_random_state *trsp)
{
if (--trsp->trs_count < 0) {
- trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_state += (unsigned long)local_clock() + raw_smp_processor_id();
trsp->trs_count = TORTURE_RANDOM_REFRESH;
}
trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
@@ -394,16 +521,16 @@ static void torture_shuffle_task_unregister_all(void)
* A special case is when shuffle_idle_cpu = -1, in which case we allow
* the tasks to run on all CPUs.
*/
-static void torture_shuffle_tasks(void)
+static void torture_shuffle_tasks(struct torture_random_state *trp)
{
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
+ cpus_read_lock();
/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
- put_online_cpus();
+ cpus_read_unlock();
return;
}
@@ -415,11 +542,13 @@ static void torture_shuffle_tasks(void)
cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
mutex_lock(&shuffle_task_mutex);
- list_for_each_entry(stp, &shuffle_task_list, st_l)
- set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ list_for_each_entry(stp, &shuffle_task_list, st_l) {
+ if (!random_shuffle || torture_random(trp) & 0x1)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ }
mutex_unlock(&shuffle_task_mutex);
- put_online_cpus();
+ cpus_read_unlock();
}
/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
@@ -428,10 +557,12 @@ static void torture_shuffle_tasks(void)
*/
static int torture_shuffle(void *arg)
{
+ DEFINE_TORTURE_RANDOM(rand);
+
VERBOSE_TOROUT_STRING("torture_shuffle task started");
do {
- schedule_timeout_interruptible(shuffle_interval);
- torture_shuffle_tasks();
+ torture_hrtimeout_jiffies(shuffle_interval, &rand);
+ torture_shuffle_tasks(&rand);
torture_shutdown_absorb("torture_shuffle");
} while (!torture_must_stop());
torture_kthread_stopping("torture_shuffle");
@@ -448,7 +579,7 @@ int torture_shuffle_init(long shuffint)
shuffle_idle_cpu = -1;
if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ TOROUT_ERRSTRING("Failed to alloc mask");
return -ENOMEM;
}
@@ -527,7 +658,8 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
- rcu_ftrace_dump(DUMP_ALL);
+ if (ftrace_dump_at_shutdown)
+ rcu_ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
@@ -541,7 +673,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
if (ssecs > 0) {
shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
return torture_create_kthread(torture_shutdown, NULL,
- shutdown_task);
+ shutdown_task);
}
return 0;
}
@@ -588,7 +720,7 @@ static void torture_shutdown_cleanup(void)
* suddenly applied to or removed from the system.
*/
static struct task_struct *stutter_task;
-static int stutter_pause_test;
+static ktime_t stutter_till_abs_time;
static int stutter;
static int stutter_gap;
@@ -598,23 +730,16 @@ static int stutter_gap;
*/
bool stutter_wait(const char *title)
{
- int spt;
bool ret = false;
+ ktime_t till_ns;
cond_resched_tasks_rcu_qs();
- spt = READ_ONCE(stutter_pause_test);
- for (; spt; spt = READ_ONCE(stutter_pause_test)) {
+ till_ns = READ_ONCE(stutter_till_abs_time);
+ if (till_ns && ktime_before(ktime_get(), till_ns)) {
+ torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL);
ret = true;
- if (spt == 1) {
- schedule_timeout_interruptible(1);
- } else if (spt == 2) {
- while (READ_ONCE(stutter_pause_test))
- cond_resched();
- } else {
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
- }
- torture_shutdown_absorb(title);
}
+ torture_shutdown_absorb(title);
return ret;
}
EXPORT_SYMBOL_GPL(stutter_wait);
@@ -625,24 +750,18 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
- int wtime;
+ ktime_t till_ns;
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop() && stutter > 1) {
- wtime = stutter;
- if (stutter > HZ + 1) {
- WRITE_ONCE(stutter_pause_test, 1);
- wtime = stutter - HZ - 1;
- schedule_timeout_interruptible(wtime);
- wtime = HZ + 1;
- }
- WRITE_ONCE(stutter_pause_test, 2);
- schedule_timeout_interruptible(wtime);
+ till_ns = ktime_add_ns(ktime_get(),
+ jiffies_to_nsecs(stutter));
+ WRITE_ONCE(stutter_till_abs_time, till_ns);
+ torture_hrtimeout_jiffies(stutter - 1, NULL);
}
- WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
- schedule_timeout_interruptible(stutter_gap);
+ torture_hrtimeout_jiffies(stutter_gap, NULL);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -672,6 +791,13 @@ static void torture_stutter_cleanup(void)
stutter_task = NULL;
}
+static void
+torture_print_module_parms(void)
+{
+ pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n",
+ torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle);
+}
+
/*
* Initialize torture module. Please note that this is -not- invoked via
* the usual module_init() mechanism, but rather by an explicit call from
@@ -685,15 +811,16 @@ bool torture_init_begin(char *ttype, int v)
{
mutex_lock(&fullstop_mutex);
if (torture_type != NULL) {
- pr_alert("torture_init_begin: Refusing %s init: %s running.\n",
- ttype, torture_type);
- pr_alert("torture_init_begin: One torture test at a time!\n");
+ pr_alert("%s: Refusing %s init: %s running.\n",
+ __func__, ttype, torture_type);
+ pr_alert("%s: One torture test at a time!\n", __func__);
mutex_unlock(&fullstop_mutex);
return false;
}
torture_type = ttype;
verbose = v;
fullstop = FULLSTOP_DONTSTOP;
+ torture_print_module_parms();
return true;
}
EXPORT_SYMBOL_GPL(torture_init_begin);
@@ -780,11 +907,11 @@ void torture_kthread_stopping(char *title)
{
char buf[128];
- snprintf(buf, sizeof(buf), "Stopping %s", title);
+ snprintf(buf, sizeof(buf), "%s is stopping", title);
VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
}
EXPORT_SYMBOL_GPL(torture_kthread_stopping);
@@ -795,17 +922,23 @@ EXPORT_SYMBOL_GPL(torture_kthread_stopping);
* it starts, you will need to open-code your own.
*/
int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
- char *f, struct task_struct **tp)
+ char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp))
{
int ret = 0;
VERBOSE_TOROUT_STRING(m);
- *tp = kthread_run(fn, arg, "%s", s);
+ *tp = kthread_create(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
- VERBOSE_TOROUT_ERRSTRING(f);
+ TOROUT_ERRSTRING(f);
*tp = NULL;
+ return ret;
}
+
+ if (cbf)
+ cbf(*tp);
+
+ wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a4020c0b4508..61c541c36596 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
@@ -20,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
+config HAVE_FUNCTION_GRAPH_RETVAL
+ bool
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -31,6 +45,24 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+ bool
+
+config HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ bool
+ help
+ If this is set, then arguments and stack can be found from
+ the ftrace_regs passed into the function callback regs parameter
+ by default, even without setting the REGS flag in the ftrace_ops.
+ This allows for use of ftrace_regs_get_argument() and
+ ftrace_regs_get_stack_pointer().
+
+config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ bool
+ help
+ If the architecture generates __patchable_function_entries sections
+ but does not want them included in the ftrace locations.
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -51,11 +83,36 @@ config HAVE_NOP_MCOUNT
help
Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount
+config HAVE_OBJTOOL_MCOUNT
+ bool
+ help
+ Arch supports objtool --mcount
+
+config HAVE_OBJTOOL_NOP_MCOUNT
+ bool
+ help
+ Arch supports the objtool options --mcount with --mnop.
+ An architecture can select this if it wants to enable nop'ing
+ of ftrace locations.
+
config HAVE_C_RECORDMCOUNT
bool
help
C version of recordmcount available?
+config HAVE_BUILDTIME_MCOUNT_SORT
+ bool
+ help
+ An architecture selects this if it sorts the mcount_loc section
+ at build time.
+
+config BUILDTIME_MCOUNT_SORT
+ bool
+ default y
+ depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE
+ help
+ Sort the mcount_loc section at build time.
+
config TRACER_MAX_TRACE
bool
@@ -106,6 +163,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
+ select TASKS_RCU if PREEMPTION
config GENERIC_TRACER
bool
@@ -121,10 +179,9 @@ config TRACING_SUPPORT
depends on STACKTRACE_SUPPORT
default y
-if TRACING_SUPPORT
-
menuconfig FTRACE
bool "Tracers"
+ depends on TRACING_SUPPORT
default y if DEBUG_KERNEL
help
Enable the kernel tracing infrastructure.
@@ -156,7 +213,8 @@ config FUNCTION_TRACER
sequence is then dynamically patched into a tracer call when
tracing is enabled by the administrator. If it's runtime disabled
(the bootup default), then the overhead of the instructions is very
- small and not measurable even in micro-benchmarks.
+ small and not measurable even in micro-benchmarks (at least on
+ x86, but may have impact on other architectures).
config FUNCTION_GRAPH_TRACER
bool "Kernel Function Graph Tracer"
@@ -172,6 +230,18 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+config FUNCTION_GRAPH_RETVAL
+ bool "Kernel Function Graph Return Value"
+ depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on FUNCTION_GRAPH_TRACER
+ default n
+ help
+ Support recording and printing the function return value when
+ using function graph tracer. It can be helpful to locate functions
+ that return errors. This feature is off by default, and you can
+ enable it via the trace option funcgraph-retval.
+ See Documentation/trace/ftrace.rst
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
@@ -187,7 +257,7 @@ config DYNAMIC_FTRACE
enabled, and the functions not enabled will not affect
performance of the system.
- See the files in /sys/kernel/debug/tracing:
+ See the files in /sys/kernel/tracing:
available_filter_functions
set_ftrace_filter
set_ftrace_notrace
@@ -202,9 +272,33 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_CALL_OPS
+ def_bool y
+ depends on HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on HAVE_RETHOOK
+ select RETHOOK
+ default n
+ help
+ This option enables kernel function probe (fprobe) based on ftrace.
+ The fprobe is similar to kprobes, but probes only for kernel function
+ entries and exits. This also can probe multiple functions by one
+ fprobe.
+
+ If unsure, say N.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -227,7 +321,7 @@ config STACK_TRACER
select KALLSYMS
help
This special tracer records the maximum stack footprint of the
- kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+ kernel and displays it in /sys/kernel/tracing/stack_trace.
This tracer works by hooking into every function call that the
kernel executes, and keeping a maximum stack depth value and
@@ -253,7 +347,6 @@ config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -268,7 +361,7 @@ config IRQSOFF_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the preempt-off timing option can be
@@ -277,7 +370,6 @@ config IRQSOFF_TRACER
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPTION
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -293,7 +385,7 @@ config PREEMPT_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the irqs-off timing option can be
@@ -312,6 +404,7 @@ config SCHED_TRACER
config HWLAT_TRACER
bool "Tracer to detect hardware latencies (like SMIs)"
select GENERIC_TRACER
+ select TRACER_MAX_TRACE
help
This tracer, when enabled will create one or more kernel threads,
depending on what the cpumask file is set to, which each thread
@@ -344,6 +437,69 @@ config HWLAT_TRACER
file. Every time a latency is greater than tracing_thresh, it will
be recorded into the ring buffer.
+config OSNOISE_TRACER
+ bool "OS Noise tracer"
+ select GENERIC_TRACER
+ select TRACER_MAX_TRACE
+ help
+ In the context of high-performance computing (HPC), the Operating
+ System Noise (osnoise) refers to the interference experienced by an
+ application due to activities inside the operating system. In the
+ context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread
+ can cause noise to the system. Moreover, hardware-related jobs can
+ also cause noise, for example, via SMIs.
+
+ The osnoise tracer leverages the hwlat_detector by running a similar
+ loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
+ the sources of osnoise during its execution. The osnoise tracer takes
+ note of the entry and exit point of any source of interferences,
+ increasing a per-cpu interference counter. It saves an interference
+ counter for each source of interference. The interference counter for
+ NMI, IRQs, SoftIRQs, and threads is increased anytime the tool
+ observes these interferences' entry events. When a noise happens
+ without any interference from the operating system level, the
+ hardware noise counter increases, pointing to a hardware-related
+ noise. In this way, osnoise can account for any source of
+ interference. At the end of the period, the osnoise tracer prints
+ the sum of all noise, the max single noise, the percentage of CPU
+ available for the thread, and the counters for the noise sources.
+
+ In addition to the tracer, a set of tracepoints were added to
+ facilitate the identification of the osnoise source.
+
+ The output will appear in the trace and trace_pipe files.
+
+ To enable this tracer, echo in "osnoise" into the current_tracer
+ file.
+
+config TIMERLAT_TRACER
+ bool "Timerlat tracer"
+ select OSNOISE_TRACER
+ select GENERIC_TRACER
+ help
+ The timerlat tracer aims to help the preemptive kernel developers
+ to find sources of wakeup latencies of real-time threads.
+
+ The tracer creates a per-cpu kernel thread with real-time priority.
+ The tracer thread sets a periodic timer to wakeup itself, and goes
+ to sleep waiting for the timer to fire. At the wakeup, the thread
+ then computes a wakeup latency value as the difference between
+ the current time and the absolute time that the timer was set
+ to expire.
+
+ The tracer prints two lines at every activation. The first is the
+ timer latency observed at the hardirq context before the
+ activation of the thread. The second is the timer latency observed
+ by the thread, which is the same level that cyclictest reports. The
+ ACTIVATION ID field serves to relate the irq execution to its
+ respective thread execution.
+
+ The tracer is build on top of osnoise tracer, and the osnoise:
+ events can be used to trace the source of interference from NMI,
+ IRQs and other threads. It also enables the capture of the
+ stacktrace at the IRQ context, which helps to identify the code
+ path that can cause thread delay.
+
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && PCI
@@ -381,7 +537,7 @@ config TRACER_SNAPSHOT
Allow tracing users to take snapshot of the current buffer using the
ftrace interface, e.g.:
- echo 1 > /sys/kernel/debug/tracing/snapshot
+ echo 1 > /sys/kernel/tracing/snapshot
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
@@ -393,7 +549,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP
full swap (all buffers). If this is set, then the following is
allowed:
- echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+ echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot
After which, only the tracing buffer for CPU 2 was swapped with
the main tracing buffer, and the other CPU buffers remain the same.
@@ -440,7 +596,7 @@ config PROFILE_ANNOTATED_BRANCHES
This tracer profiles all likely and unlikely macros
in the kernel. It will display the results in:
- /sys/kernel/debug/tracing/trace_stat/branch_annotated
+ /sys/kernel/tracing/trace_stat/branch_annotated
Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
@@ -453,7 +609,7 @@ config PROFILE_ALL_BRANCHES
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /sys/kernel/debug/tracing/trace_stat/branch_all
+ /sys/kernel/tracing/trace_stat/branch_all
This option also enables the likely/unlikely profiler.
@@ -504,11 +660,37 @@ config BLK_DEV_IO_TRACE
Tracing also is possible using the ftrace interface, e.g.:
echo 1 > /sys/block/sda/sda1/trace/enable
- echo blk > /sys/kernel/debug/tracing/current_tracer
- cat /sys/kernel/debug/tracing/trace_pipe
+ echo blk > /sys/kernel/tracing/current_tracer
+ cat /sys/kernel/tracing/trace_pipe
If unsure, say N.
+config FPROBE_EVENTS
+ depends on FPROBE
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ bool "Enable fprobe-based dynamic events"
+ select TRACING
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ This allows user to add tracing events on the function entry and
+ exit via ftrace interface. The syntax is same as the kprobe events
+ and the kprobe events on function entry and exit will be
+ transparently converted to this fprobe events.
+
+config PROBE_EVENTS_BTF_ARGS
+ depends on HAVE_FUNCTION_ARG_ACCESS_API
+ depends on FPROBE_EVENTS || KPROBE_EVENTS
+ depends on DEBUG_INFO_BTF && BPF_SYSCALL
+ bool "Support BTF function arguments for probe events"
+ default y
+ help
+ The user can specify the arguments of the probe event using the names
+ of the arguments of the probed function, when the probe location is a
+ kernel function entry or a tracepoint.
+ This is available only if BTF (BPF Type Format) support is enabled.
+
config KPROBE_EVENTS
depends on KPROBES
depends on HAVE_REGS_AND_STACK_ACCESS_API
@@ -531,14 +713,14 @@ config KPROBE_EVENTS
config KPROBE_EVENTS_ON_NOTRACE
bool "Do NOT protect notrace function from kprobe events"
depends on KPROBE_EVENTS
- depends on KPROBES_ON_FTRACE
+ depends on DYNAMIC_FTRACE
default n
help
This is only for the developers who want to debug ftrace itself
using kprobe events.
If kprobes can use ftrace instead of breakpoint, ftrace related
- functions are protected from kprobe-events to prevent an infinit
+ functions are protected from kprobe-events to prevent an infinite
recursion or any unexpected execution path which leads to a kernel
crash.
@@ -595,6 +777,31 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
+config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ bool
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_CC
+ def_bool y
+ depends on $(cc-option,-mrecord-mcount)
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_OBJTOOL
+ def_bool y
+ depends on HAVE_OBJTOOL_MCOUNT
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on FTRACE_MCOUNT_RECORD
+ select OBJTOOL
+
+config FTRACE_MCOUNT_USE_RECORDMCOUNT
+ def_bool y
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on !FTRACE_MCOUNT_USE_OBJTOOL
+ depends on FTRACE_MCOUNT_RECORD
+
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -622,6 +829,21 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by registering a value and bit with the kernel
+ that reflects when it is enabled or not.
+
+ See Documentation/trace/user_events.rst.
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -658,7 +880,7 @@ config TRACEPOINT_BENCHMARK
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
- goes into an infinite loop (calling cond_sched() to let other tasks
+ goes into an infinite loop (calling cond_resched() to let other tasks
run), and calls the tracepoint. Each iteration will record the time
it took to write to the tracepoint and the next iteration that
data will be passed to the tracepoint itself. That is, the tracepoint
@@ -727,6 +949,45 @@ config TRACE_EVAL_MAP_FILE
If unsure, say N.
+config FTRACE_RECORD_RECURSION
+ bool "Record functions that recurse in function tracing"
+ depends on FUNCTION_TRACER
+ help
+ All callbacks that attach to the function tracing have some sort
+ of protection against recursion. Even though the protection exists,
+ it adds overhead. This option will create a file in the tracefs
+ file system called "recursed_functions" that will list the functions
+ that triggered a recursion.
+
+ This will add more overhead to cases that have recursion.
+
+ If unsure, say N
+
+config FTRACE_RECORD_RECURSION_SIZE
+ int "Max number of recursed functions to record"
+ default 128
+ depends on FTRACE_RECORD_RECURSION
+ help
+ This defines the limit of number of functions that can be
+ listed in the "recursed_functions" file, that lists all
+ the functions that caused a recursion to happen.
+ This file can be reset, but the limit can not change in
+ size at runtime.
+
+config RING_BUFFER_RECORD_RECURSION
+ bool "Record functions that recurse in the ring buffer"
+ depends on FTRACE_RECORD_RECURSION
+ # default y, because it is coupled with FTRACE_RECORD_RECURSION
+ default y
+ help
+ The ring buffer has its own internal recursion. Although when
+ recursion happens it won't cause harm because of the protection,
+ but it does cause unwanted overhead. Enabling this option will
+ place where recursion was detected into the ftrace "recursed_functions"
+ file.
+
+ This will add more overhead to cases that have recursion.
+
config GCOV_PROFILE_FTRACE
bool "Enable GCOV profiling on ftrace subsystem"
depends on GCOV_KERNEL
@@ -774,6 +1035,20 @@ config EVENT_TRACE_TEST_SYSCALLS
TBD - enable a way to actually call the syscalls as we test their
events
+config FTRACE_SORT_STARTUP_TEST
+ bool "Verify compile time sorting of ftrace functions"
+ depends on DYNAMIC_FTRACE
+ depends on BUILDTIME_MCOUNT_SORT
+ help
+ Sorting of the mcount_loc sections that is used to find the
+ where the ftrace knows where to patch functions for tracing
+ and other callbacks is done at compile time. But if the sort
+ is not done correctly, it will cause non-deterministic failures.
+ When this is set, the sorted sections will be verified that they
+ are in deed sorted and will warn if they are not.
+
+ If unsure, say N
+
config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
@@ -790,13 +1065,33 @@ config RING_BUFFER_STARTUP_TEST
The test runs for 10 seconds. This will slow your boot time
by at least 10 more seconds.
- At the end of the test, statics and more checks are done.
- It will output the stats of each per cpu buffer. What
+ At the end of the test, statistics and more checks are done.
+ It will output the stats of each per cpu buffer: What
was written, the sizes, what was read, what was lost, and
other similar details.
If unsure, say N
+config RING_BUFFER_VALIDATE_TIME_DELTAS
+ bool "Verify ring buffer time stamp deltas"
+ depends on RING_BUFFER
+ help
+ This will audit the time stamps on the ring buffer sub
+ buffer to make sure that all the time deltas for the
+ events on a sub buffer matches the current time stamp.
+ This audit is performed for every event that is not
+ interrupted, or interrupting another event. A check
+ is also made when traversing sub buffers to make sure
+ that all the deltas on the previous sub buffer do not
+ add up to be greater than the current time stamp.
+
+ NOTE: This adds significant overhead to recording of events,
+ and should only be used to test the logic of the ring buffer.
+ Do not use it on production systems.
+
+ Only say Y if you understand what this does, and you
+ still want it enabled. Otherwise say N
+
config MMIOTRACE_TEST
tristate "Test module for mmiotrace"
depends on MMIOTRACE && m
@@ -820,6 +1115,10 @@ config PREEMPTIRQ_DELAY_TEST
irq-disabled critical sections for 500us:
modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
+ What's more, if you want to attach the test on the cpu which the latency
+ tracer is running on, specify cpu_affinity=cpu_num at the end of the
+ command.
+
If unsure, say N
config SYNTH_EVENT_GEN_TEST
@@ -870,7 +1169,6 @@ config HIST_TRIGGERS_DEBUG
If unsure, say N.
-endif # FTRACE
-
-endif # TRACING_SUPPORT
+source "kernel/trace/rv/Kconfig"
+endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6575bb0a0434..057cd975d014 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -2,9 +2,9 @@
# Do not instrument the tracer itself:
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
# Avoid recursion due to instrumentation.
KCSAN_SANITIZE := n
@@ -31,6 +31,12 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
+# Functions in this file could be invoked from early interrupt
+# code and produce random code coverage.
+KCOV_INSTRUMENT_trace_preemptirq.o := n
+
+CFLAGS_bpf_trace.o := -I$(src)
+
CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
@@ -45,6 +51,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o
@@ -56,6 +63,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
+obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -74,11 +82,14 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
@@ -88,9 +99,15 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
+obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_RV) += rv/
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5ef0484513ec..d5d94510afd3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -34,7 +34,7 @@ static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
static LIST_HEAD(running_trace_list);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- int pc = 0;
+ unsigned int trace_ctx = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -107,7 +107,7 @@ record_it:
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
@@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk)
struct blk_trace *bt;
tsk->btrace_seq = blktrace_seq;
- spin_lock_irqsave(&running_trace_lock, flags);
+ raw_spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
sizeof(tsk->comm), 0);
}
- spin_unlock_irqrestore(&running_trace_lock, flags);
+ raw_spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
@@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt)
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
- const char *fmt, ...)
+void __blk_trace_note_message(struct blk_trace *bt,
+ struct cgroup_subsys_state *css, const char *fmt, ...)
{
int n;
va_list args;
unsigned long flags;
char *buf;
+ u64 cgid = 0;
if (unlikely(bt->trace_state != Blktrace_running &&
!blk_tracer_enabled))
@@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
- blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
- blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
-#else
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
+ if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ cgid = cgroup_id(css->cgroup);
+ else
+ cgid = 1;
#endif
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(__trace_note_message);
+EXPORT_SYMBOL_GPL(__blk_trace_note_message);
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
pid_t pid)
@@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \
(ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
@@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data, u64 cgid)
+ const blk_opf_t opf, u32 what, int error,
+ int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -222,20 +222,22 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
+ unsigned int trace_ctx = 0;
pid_t pid;
- int cpu, pc = 0;
+ int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ const enum req_op op = opf & REQ_OP_MASK;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
what |= ddir_act[op_is_write(op) ? WRITE : READ];
- what |= MASK_TC_BIT(op_flags, SYNC);
- what |= MASK_TC_BIT(op_flags, RAHEAD);
- what |= MASK_TC_BIT(op_flags, META);
- what |= MASK_TC_BIT(op_flags, PREFLUSH);
- what |= MASK_TC_BIT(op_flags, FUA);
+ what |= MASK_TC_BIT(opf, SYNC);
+ what |= MASK_TC_BIT(opf, RAHEAD);
+ what |= MASK_TC_BIT(opf, META);
+ what |= MASK_TC_BIT(opf, PREFLUSH);
+ what |= MASK_TC_BIT(opf, FUA);
if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
@@ -252,10 +254,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -301,7 +303,7 @@ record_it:
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
return;
}
}
@@ -309,12 +311,20 @@ record_it:
local_irq_restore(flags);
}
-static void blk_trace_free(struct blk_trace *bt)
+static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
+
+ /*
+ * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
+ * under 'q->debugfs_dir', thus lookup and remove them.
+ */
+ if (!bt->dir) {
+ debugfs_lookup_and_remove("dropped", q->debugfs_dir);
+ debugfs_lookup_and_remove("msg", q->debugfs_dir);
+ } else {
+ debugfs_remove(bt->dir);
+ }
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -336,10 +346,42 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
-static void blk_trace_cleanup(struct blk_trace *bt)
+static int blk_trace_start(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_setup &&
+ bt->trace_state != Blktrace_stopped)
+ return -EINVAL;
+
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ trace_note_time(bt);
+
+ return 0;
+}
+
+static int blk_trace_stop(struct blk_trace *bt)
{
+ if (bt->trace_state != Blktrace_running)
+ return -EINVAL;
+
+ bt->trace_state = Blktrace_stopped;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+
+ return 0;
+}
+
+static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
+{
+ blk_trace_stop(bt);
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
put_probe_ref();
}
@@ -348,12 +390,11 @@ static int __blk_trace_remove(struct request_queue *q)
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (!bt)
return -EINVAL;
- if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -362,9 +403,9 @@ int blk_trace_remove(struct request_queue *q)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_remove(q);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -402,7 +443,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, NULL, "%s", msg);
+ __blk_trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -449,7 +490,7 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
&relay_file_operations);
}
-static struct rchan_callbacks blk_relay_callbacks = {
+static const struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
@@ -458,14 +499,9 @@ static struct rchan_callbacks blk_relay_callbacks = {
static void blk_trace_setup_lba(struct blk_trace *bt,
struct block_device *bdev)
{
- struct hd_struct *part = NULL;
-
- if (bdev)
- part = bdev->bd_part;
-
- if (part) {
- bt->start_lba = part->start_sect;
- bt->end_lba = part->start_sect + part->nr_sects;
+ if (bdev) {
+ bt->start_lba = bdev->bd_start_sect;
+ bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
} else {
bt->start_lba = 0;
bt->end_lba = -1ULL;
@@ -483,12 +519,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct dentry *dir = NULL;
int ret;
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- if (!blk_debugfs_root)
- return -ENOENT;
-
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
@@ -503,7 +538,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex))) {
+ lockdep_is_held(&q->debugfs_mutex))) {
pr_warn("Concurrent blktraces are not allowed on %s\n",
buts->name);
return -EBUSY;
@@ -522,21 +557,36 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;
- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
+ /*
+ * When tracing the whole disk reuse the existing debugfs directory
+ * created by the block layer on init. For partitions block devices,
+ * and scsi-generic block devices we create a temporary new debugfs
+ * directory that will be removed once the trace ends.
+ */
+ if (bdev && !bdev_is_partition(bdev))
+ dir = q->debugfs_dir;
+ else
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ ret = -ENOENT;
+ goto err;
+ }
+
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+ debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -563,10 +613,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -597,9 +645,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -641,53 +689,26 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
- int ret;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
- /*
- * For starting a trace, we can transition from a setup or stopped
- * trace. For stopping a trace, the state must be running
- */
- ret = -EINVAL;
- if (start) {
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped) {
- blktrace_seq++;
- smp_mb();
- bt->trace_state = Blktrace_running;
- spin_lock_irq(&running_trace_lock);
- list_add(&bt->running_list, &running_trace_list);
- spin_unlock_irq(&running_trace_lock);
-
- trace_note_time(bt);
- ret = 0;
- }
- } else {
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- ret = 0;
- }
- }
-
- return ret;
+ if (start)
+ return blk_trace_start(bt);
+ else
+ return blk_trace_stop(bt);
}
int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_startstop(q, start);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -700,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
*/
/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * blk_trace_ioctl - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
@@ -708,30 +729,26 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
**/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
- struct request_queue *q;
+ struct request_queue *q = bdev_get_queue(bdev);
int ret, start = 0;
char b[BDEVNAME_SIZE];
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
switch (cmd) {
case BLKTRACESETUP:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#endif
case BLKTRACESTART:
start = 1;
- /* fall through */
+ fallthrough;
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
@@ -743,30 +760,26 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
break;
}
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
/**
- * blk_trace_shutdown: - stop and cleanup trace structures
+ * blk_trace_shutdown - stop and cleanup trace structures
* @q: the request queue associated with the device
*
**/
void blk_trace_shutdown(struct request_queue *q)
{
- mutex_lock(&q->blk_trace_mutex);
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex))) {
- __blk_trace_startstop(q, 0);
+ lockdep_is_held(&q->debugfs_mutex)))
__blk_trace_remove(q);
- }
-
- mutex_unlock(&q->blk_trace_mutex);
}
#ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
+ struct cgroup_subsys_state *blkcg_css;
struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
@@ -774,24 +787,25 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0;
- if (!bio->bi_blkg)
+ blkcg_css = bio_blkcg_css(bio);
+ if (!blkcg_css)
return 0;
- return cgroup_id(bio_blkcg(bio)->css.cgroup);
+ return cgroup_id(blkcg_css->cgroup);
}
#else
-u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
return 0;
}
#endif
static u64
-blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+blk_trace_request_get_cgid(struct request *rq)
{
if (!rq->bio)
return 0;
/* Use the first bio */
- return blk_trace_bio_get_cgid(q, rq->bio);
+ return blk_trace_bio_get_cgid(rq->q, rq->bio);
}
/*
@@ -810,7 +824,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt;
@@ -827,38 +841,40 @@ static void blk_add_trace_rq(struct request *rq, int error,
else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL, cgid);
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
+ what, blk_status_to_errno(error), 0, NULL, cgid);
rcu_read_unlock();
}
-static void blk_add_trace_rq_insert(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_issue(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
+}
+
+static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
+{
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_requeue(void *ignore,
- struct request_queue *q,
- struct request *rq)
+static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
- int error, unsigned int nr_bytes)
+ blk_status_t error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
- blk_trace_request_get_cgid(rq->q, rq));
+ blk_trace_request_get_cgid(rq));
}
/**
@@ -885,15 +901,14 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
@@ -903,63 +918,26 @@ static void blk_add_trace_bio_complete(void *ignore,
blk_status_to_errno(bio->bi_status));
}
-static void blk_add_trace_bio_backmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
+ 0);
}
-static void blk_add_trace_bio_frontmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
+ 0);
}
-static void blk_add_trace_bio_queue(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
-}
-
-static void blk_add_trace_getrq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
-{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL, 0);
- rcu_read_unlock();
- }
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
}
-
-static void blk_add_trace_sleeprq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
+static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL, 0);
- rcu_read_unlock();
- }
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -969,7 +947,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
}
@@ -989,15 +967,14 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
rcu_read_unlock();
}
-static void blk_add_trace_split(void *ignore,
- struct request_queue *q, struct bio *bio,
- unsigned int pdu)
+static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
@@ -1006,8 +983,7 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT,
+ bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
blk_status_to_errno(bio->bi_status),
sizeof(rpdu), &rpdu,
blk_trace_bio_get_cgid(q, bio));
@@ -1018,20 +994,16 @@ static void blk_add_trace_split(void *ignore,
/**
* blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @bio: the source bio
- * @dev: target device
+ * @dev: source device
* @from: source sector
*
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
+ * Called after a bio is remapped to a different device and/or sector.
**/
-static void blk_add_trace_bio_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+ sector_t from)
{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
@@ -1047,7 +1019,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ bio->bi_opf, BLK_TA_REMAP,
blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
@@ -1056,7 +1028,6 @@ static void blk_add_trace_bio_remap(void *ignore,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @rq: the source request
* @dev: target device
* @from: source sector
@@ -1066,34 +1037,31 @@ static void blk_add_trace_bio_remap(void *ignore,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(void *ignore,
- struct request_queue *q,
- struct request *rq, dev_t dev,
+static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
+ r.device_to = cpu_to_be32(disk_devt(rq->q->disk));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ rq->cmd_flags, BLK_TA_REMAP, 0,
+ sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
/**
* blk_add_driver_data - Add binary message with driver-specific data
- * @q: queue the io is for
* @rq: io request
* @data: driver-specific data
* @len: length of driver-specific data
@@ -1102,22 +1070,20 @@ static void blk_add_trace_rq_remap(void *ignore,
* Some drivers might want to write driver-specific data per request.
*
**/
-void blk_add_driver_data(struct request_queue *q,
- struct request *rq,
- void *data, size_t len)
+void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
struct blk_trace *bt;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
- __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, 0, len, data,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1130,6 +1096,8 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
+ ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
+ WARN_ON(ret);
ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
@@ -1146,8 +1114,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
- WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1167,7 +1133,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@ -1176,6 +1141,7 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
@@ -1319,7 +1285,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
* ones now use the 64bit ino as the whole ID and
* no longer use generation.
*
- * Regarldess of the content, always output
+ * Regardless of the content, always output
* "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
* be mapped back to @id on both 64 and 32bit ino
* setups. See __kernfs_fh_to_dentry().
@@ -1361,7 +1327,7 @@ static void blk_log_dump_pdu(struct trace_seq *s,
i == 0 ? "" : " ", pdu_buf[i]);
/*
- * stop when the rest is just zeroes and indicate so
+ * stop when the rest is just zeros and indicate so
* with a ".." appended
*/
if (i == end && end != pdu_len - 1) {
@@ -1578,7 +1544,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
@@ -1642,13 +1609,15 @@ static int blk_trace_remove_queue(struct request_queue *q)
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
+ blk_trace_stop(bt);
+
put_probe_ref();
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return 0;
}
@@ -1679,7 +1648,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
return 0;
free_bt:
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1791,36 +1760,19 @@ static ssize_t blk_trace_mask2str(char *buf, int mask)
return p - buf;
}
-static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
-{
- if (bdev->bd_disk == NULL)
- return NULL;
-
- return bdev_get_queue(bdev);
-}
-
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- struct request_queue *q;
- struct block_device *bdev;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
ssize_t ret = -ENXIO;
- bdev = bdget(part_devt(p));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
-
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
@@ -1838,10 +1790,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
-out_bdput:
- bdput(bdev);
-out:
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -1849,9 +1798,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct block_device *bdev;
- struct request_queue *q;
- struct hd_struct *p;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1867,24 +1815,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
value = ret;
}
- } else if (kstrtoull(buf, 0, &value))
- goto out;
-
- ret = -ENXIO;
-
- p = dev_to_part(dev);
- bdev = bdget(part_devt(p));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
+ } else {
+ if (kstrtoull(buf, 0, &value))
+ goto out;
+ }
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
if (!!value == !!bt) {
ret = 0;
@@ -1901,7 +1840,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
}
if (ret == 0) {
@@ -1916,37 +1855,33 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
-out_bdput:
- bdput(bdev);
+ mutex_unlock(&q->debugfs_mutex);
out:
return ret ? ret : count;
}
-
-int blk_trace_init_sysfs(struct device *dev)
-{
- return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
-}
-
-void blk_trace_remove_sysfs(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
-}
-
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs: buffer to be filled
+ * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint
+ *
+ * Description:
+ * Maps each request operation and flag to a single character and fills the
+ * buffer provided by the caller with resulting string.
+ *
+ **/
+void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
{
int i = 0;
- if (op & REQ_PREFLUSH)
+ if (opf & REQ_PREFLUSH)
rwbs[i++] = 'F';
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_WRITE:
- case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
@@ -1966,13 +1901,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
rwbs[i++] = 'N';
}
- if (op & REQ_FUA)
+ if (opf & REQ_FUA)
rwbs[i++] = 'F';
- if (op & REQ_RAHEAD)
+ if (opf & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (op & REQ_SYNC)
+ if (opf & REQ_SYNC)
rwbs[i++] = 'S';
- if (op & REQ_META)
+ if (opf & REQ_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7bc3d6175868..7ac6c52b25eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -6,22 +6,45 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/bpf_perf_event.h>
+#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <linux/key.h>
+#include <linux/verification.h>
+#include <linux/namei.h>
+#include <linux/fileattr.h>
+
+#include <net/bpf_sk_storage.h>
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/btf.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include "bpf_trace.h"
+
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
+#define MAX_UPROBE_MULTI_CNT (1U << 20)
+#define MAX_KPROBE_MULTI_CNT (1U << 20)
+
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
@@ -62,6 +85,15 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
+ u64 flags, const struct btf **btf,
+ s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
@@ -80,9 +112,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
- if (in_nmi()) /* not supported yet */
- return 1;
-
cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
@@ -92,6 +121,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
+ rcu_read_lock();
+ bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+ rcu_read_unlock();
ret = 0;
goto out;
}
@@ -100,7 +132,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
- * a heurisitc to speed up execution.
+ * a heuristic to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
@@ -111,7 +143,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
- ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
+ rcu_read_lock();
+ ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
+ ctx, bpf_prog_run);
+ rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -168,6 +203,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size,
{
int ret;
+ /*
+ * NB: We rely on strncpy_from_user() not copying junk past the NUL
+ * terminator into `dst`.
+ *
+ * strncpy_from_user() does long-sized strides in the fast path. If the
+ * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
+ * then there could be junk after the NUL in `dst`. If user takes `dst`
+ * and keys a hash map with it, then semantically identical strings can
+ * occupy multiple entries in the map.
+ */
ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
@@ -189,22 +234,6 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.arg3_type = ARG_ANYTHING,
};
-static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
-{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
-
- if (unlikely(ret < 0))
- goto fail;
- ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
- if (unlikely(ret < 0))
- goto fail;
- return ret;
-fail:
- memset(dst, 0, size);
- return ret;
-}
-
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
@@ -223,10 +252,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = {
static __always_inline int
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
-
- if (unlikely(ret < 0))
- goto fail;
+ int ret;
/*
* The strncpy_from_kernel_nofault() call will likely not fill the
@@ -239,11 +265,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
*/
ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
- goto fail;
-
- return ret;
-fail:
- memset(dst, 0, size);
+ memset(dst, 0, size);
return ret;
}
@@ -321,8 +343,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -334,7 +354,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -349,371 +369,140 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
-static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
- size_t bufsz)
-{
- void __user *user_ptr = (__force void __user *)unsafe_ptr;
-
- buf[0] = 0;
-
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)unsafe_ptr < TASK_SIZE) {
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
- fallthrough;
-#endif
- case 'k':
- strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
- break;
- case 'u':
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
-}
+#define MAX_TRACE_PRINTK_VARARGS 3
+#define BPF_TRACE_PRINTK_SIZE 1024
-/*
- * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
- */
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
- int i, mod[3] = {}, fmt_cnt = 0;
- char buf[64], fmt_ptype;
- void *unsafe_ptr = NULL;
- bool str_seen = false;
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- return -EINVAL;
-
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
- return -EINVAL;
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt_cnt >= 3)
- return -EINVAL;
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- } else if (fmt[i] == 'p') {
- mod[fmt_cnt]++;
- if ((fmt[i + 1] == 'k' ||
- fmt[i + 1] == 'u') &&
- fmt[i + 2] == 's') {
- fmt_ptype = fmt[i + 1];
- i += 2;
- goto fmt_str;
- }
+ u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
+ int ret;
- /* disallow any further format extensions */
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- goto fmt_next;
- } else if (fmt[i] == 's') {
- mod[fmt_cnt]++;
- fmt_ptype = fmt[i];
-fmt_str:
- if (str_seen)
- /* allow only one '%s' per fmt string */
- return -EINVAL;
- str_seen = true;
-
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- switch (fmt_cnt) {
- case 0:
- unsafe_ptr = (void *)(long)arg1;
- arg1 = (long)buf;
- break;
- case 1:
- unsafe_ptr = (void *)(long)arg2;
- arg2 = (long)buf;
- break;
- case 2:
- unsafe_ptr = (void *)(long)arg3;
- arg3 = (long)buf;
- break;
- }
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args,
+ MAX_TRACE_PRINTK_VARARGS, &data);
+ if (ret < 0)
+ return ret;
- bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
- sizeof(buf));
- goto fmt_next;
- }
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- }
+ trace_bpf_trace_printk(data.buf);
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x')
- return -EINVAL;
-fmt_next:
- fmt_cnt++;
- }
+ bpf_bprintf_cleanup(&data);
-/* Horrid workaround for getting va_list handling working with different
- * argument type combinations generically for 32 and 64 bit archs.
- */
-#define __BPF_TP_EMIT() __BPF_ARG3_TP()
-#define __BPF_TP(...) \
- __trace_printk(0 /* Fake ip */, \
- fmt, ##__VA_ARGS__)
-
-#define __BPF_ARG1_TP(...) \
- ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_TP(arg1, ##__VA_ARGS__) \
- : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
- : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
-
-#define __BPF_ARG2_TP(...) \
- ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
- : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
- : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
-
-#define __BPF_ARG3_TP(...) \
- ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
- : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
- : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
-
- return __BPF_TP_EMIT();
+ return ret;
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
};
-const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+static void __set_printk_clr_event(void)
{
/*
- * this program might be calling bpf_trace_printk,
- * so allocate per-cpu printk buffers
+ * This program might be calling bpf_trace_printk,
+ * so enable the associated bpf_trace/bpf_trace_printk event.
+ * Repeat this each time as it is possible a user has
+ * disabled bpf_trace_printk events. By loading a program
+ * calling bpf_trace_printk() however the user has expressed
+ * the intent to see such events.
*/
- trace_printk_init_buffers();
+ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
+ pr_warn_ratelimited("could not enable bpf_trace_printk events");
+}
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+ __set_printk_clr_event();
return &bpf_trace_printk_proto;
}
-#define MAX_SEQ_PRINTF_VARARGS 12
-#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
-#define MAX_SEQ_PRINTF_STR_LEN 128
-
-struct bpf_seq_printf_buf {
- char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
-};
-static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
-static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
-
-BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
- const void *, data, u32, data_len)
+BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args,
+ u32, data_len)
{
- int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
- int i, buf_used, copy_size, num_args;
- u64 params[MAX_SEQ_PRINTF_VARARGS];
- struct bpf_seq_printf_buf *bufs;
- const u64 *args = data;
-
- buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
- if (WARN_ON_ONCE(buf_used > 1)) {
- err = -EBUSY;
- goto out;
- }
-
- bufs = this_cpu_ptr(&bpf_seq_printf_buf);
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- goto out;
-
- if (data_len & 7)
- goto out;
-
- for (i = 0; i < fmt_size; i++) {
- if (fmt[i] == '%') {
- if (fmt[i + 1] == '%')
- i++;
- else if (!data || !data_len)
- goto out;
- }
- }
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
+ int ret, num_args;
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
num_args = data_len / 8;
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- /* only printable ascii for now. */
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
- err = -EINVAL;
- goto out;
- }
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt[i + 1] == '%') {
- i++;
- continue;
- }
-
- if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
- err = -E2BIG;
- goto out;
- }
-
- if (fmt_cnt >= num_args) {
- err = -EINVAL;
- goto out;
- }
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
-
- /* skip optional "[0 +-][num]" width formating field */
- while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
- fmt[i] == ' ')
- i++;
- if (fmt[i] >= '1' && fmt[i] <= '9') {
- i++;
- while (fmt[i] >= '0' && fmt[i] <= '9')
- i++;
- }
-
- if (fmt[i] == 's') {
- void *unsafe_ptr;
-
- /* try our best to copy */
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
-
- unsafe_ptr = (void *)(long)args[fmt_cnt];
- err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
- if (err < 0)
- bufs->buf[memcpy_cnt][0] = '\0';
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
+ if (ret < 0)
+ return ret;
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- if (fmt[i] == 'p') {
- if (fmt[i + 1] == 0 ||
- fmt[i + 1] == 'K' ||
- fmt[i + 1] == 'x') {
- /* just kernel pointers */
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- continue;
- }
+ trace_bpf_trace_printk(data.buf);
- /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
- if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
- err = -EINVAL;
- goto out;
- }
- if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
- err = -EINVAL;
- goto out;
- }
-
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
+ bpf_bprintf_cleanup(&data);
+ return ret;
+}
- copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+static const struct bpf_func_proto bpf_trace_vprintk_proto = {
+ .func = bpf_trace_vprintk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+};
- err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- (void *) (long) args[fmt_cnt],
- copy_size);
- if (err < 0)
- memset(bufs->buf[memcpy_cnt], 0, copy_size);
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
+{
+ __set_printk_clr_event();
+ return &bpf_trace_vprintk_proto;
+}
- i += 2;
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+ const void *, args, u32, data_len)
+{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
+ int err, num_args;
- if (fmt[i] == 'l') {
- i++;
- if (fmt[i] == 'l')
- i++;
- }
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x') {
- err = -EINVAL;
- goto out;
- }
+ err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
+ if (err < 0)
+ return err;
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- }
+ seq_bprintf(m, fmt, data.bin_args);
- /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
- * all of them to seq_printf().
- */
- seq_printf(m, fmt, params[0], params[1], params[2], params[3],
- params[4], params[5], params[6], params[7], params[8],
- params[9], params[10], params[11]);
+ bpf_bprintf_cleanup(&data);
- err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
-out:
- this_cpu_dec(bpf_seq_printf_buf_used);
- return err;
+ return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
-static int bpf_seq_printf_btf_ids[5];
+BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
+
static const struct bpf_func_proto bpf_seq_printf_proto = {
.func = bpf_seq_printf,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
- .btf_id = bpf_seq_printf_btf_ids,
};
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
@@ -721,15 +510,39 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
return seq_write(m, data, len) ? -EOVERFLOW : 0;
}
-static int bpf_seq_write_btf_ids[5];
static const struct bpf_func_proto bpf_seq_write_proto = {
.func = bpf_seq_write,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr,
+ u32, btf_ptr_size, u64, flags)
+{
+ const struct btf *btf;
+ s32 btf_id;
+ int ret;
+
+ ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
+ if (ret)
+ return ret;
+
+ return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags);
+}
+
+static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
+ .func = bpf_seq_printf_btf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
- .btf_id = bpf_seq_write_btf_ids,
+ .arg4_type = ARG_ANYTHING,
};
static __always_inline int
@@ -848,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
- int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
+ struct bpf_trace_sample_data *sds;
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -857,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
},
};
struct perf_sample_data *sd;
- int err;
+ int nest_level, err;
+
+ preempt_disable();
+ sds = this_cpu_ptr(&bpf_trace_sds);
+ nest_level = this_cpu_inc_return(bpf_trace_nest_level);
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
err = -EBUSY;
@@ -872,12 +688,12 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
+ perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
-
out:
this_cpu_dec(bpf_trace_nest_level);
+ preempt_enable();
return err;
}
@@ -888,7 +704,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -902,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
struct perf_raw_frag frag = {
.copy = ctx_copy,
.size = ctx_size,
@@ -919,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
struct perf_sample_data *sd;
struct pt_regs *regs;
+ int nest_level;
u64 ret;
+ preempt_disable();
+ nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
+
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
ret = -EBUSY;
goto out;
@@ -930,11 +749,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
+ perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
+ preempt_enable();
return ret;
}
@@ -949,6 +769,35 @@ const struct bpf_func_proto bpf_get_current_task_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_get_current_task_btf)
+{
+ return (unsigned long) current;
+}
+
+const struct bpf_func_proto bpf_get_current_task_btf_proto = {
+ .func = bpf_get_current_task_btf,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_BTF_ID_TRUSTED,
+ .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
+BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
+{
+ return (unsigned long) task_pt_regs(task);
+}
+
+BTF_ID_LIST(bpf_task_pt_regs_ids)
+BTF_ID(struct, pt_regs)
+
+const struct bpf_func_proto bpf_task_pt_regs_proto = {
+ .func = bpf_task_pt_regs,
+ .gpl_only = true,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_btf_id = &bpf_task_pt_regs_ids[0],
+};
+
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -987,6 +836,7 @@ static void do_bpf_send_signal(struct irq_work *entry)
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
@@ -1000,10 +850,11 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
+ /* Task should not be pid=1 to avoid kernel panic. */
+ if (unlikely(is_global_init(current)))
+ return -EPERM;
if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
@@ -1013,14 +864,14 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
- if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
+ if (irq_work_is_busy(&work->irq_work))
return -EBUSY;
/* Add the current task, which is the target of sending signal,
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = current;
+ work->task = get_task_struct(current);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
@@ -1054,7 +905,603 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = {
.arg1_type = ARG_ANYTHING,
};
-const struct bpf_func_proto *
+BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
+{
+ struct path copy;
+ long len;
+ char *p;
+
+ if (!sz)
+ return 0;
+
+ /*
+ * The path pointer is verified as trusted and safe to use,
+ * but let's double check it's valid anyway to workaround
+ * potentially broken verifier.
+ */
+ len = copy_from_kernel_nofault(&copy, path, sizeof(*path));
+ if (len < 0)
+ return len;
+
+ p = d_path(&copy, buf, sz);
+ if (IS_ERR(p)) {
+ len = PTR_ERR(p);
+ } else {
+ len = buf + sz - p;
+ memmove(buf, p, len);
+ }
+
+ return len;
+}
+
+BTF_SET_START(btf_allowlist_d_path)
+#ifdef CONFIG_SECURITY
+BTF_ID(func, security_file_permission)
+BTF_ID(func, security_inode_getattr)
+BTF_ID(func, security_file_open)
+#endif
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, security_path_truncate)
+#endif
+BTF_ID(func, vfs_truncate)
+BTF_ID(func, vfs_fallocate)
+BTF_ID(func, dentry_open)
+BTF_ID(func, vfs_getattr)
+BTF_ID(func, filp_close)
+BTF_SET_END(btf_allowlist_d_path)
+
+static bool bpf_d_path_allowed(const struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+
+ if (prog->type == BPF_PROG_TYPE_LSM)
+ return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
+
+ return btf_id_set_contains(&btf_allowlist_d_path,
+ prog->aux->attach_btf_id);
+}
+
+BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path)
+
+static const struct bpf_func_proto bpf_d_path_proto = {
+ .func = bpf_d_path,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_d_path_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .allowed = bpf_d_path_allowed,
+};
+
+#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \
+ BTF_F_PTR_RAW | BTF_F_ZERO)
+
+static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
+ u64 flags, const struct btf **btf,
+ s32 *btf_id)
+{
+ const struct btf_type *t;
+
+ if (unlikely(flags & ~(BTF_F_ALL)))
+ return -EINVAL;
+
+ if (btf_ptr_size != sizeof(struct btf_ptr))
+ return -EINVAL;
+
+ *btf = bpf_get_btf_vmlinux();
+
+ if (IS_ERR_OR_NULL(*btf))
+ return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
+
+ if (ptr->type_id > 0)
+ *btf_id = ptr->type_id;
+ else
+ return -EINVAL;
+
+ if (*btf_id > 0)
+ t = btf_type_by_id(*btf, *btf_id);
+ if (*btf_id <= 0 || !t)
+ return -ENOENT;
+
+ return 0;
+}
+
+BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr,
+ u32, btf_ptr_size, u64, flags)
+{
+ const struct btf *btf;
+ s32 btf_id;
+ int ret;
+
+ ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
+ if (ret)
+ return ret;
+
+ return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size,
+ flags);
+}
+
+const struct bpf_func_proto bpf_snprintf_btf_proto = {
+ .func = bpf_snprintf_btf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-2];
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
+ .func = bpf_get_func_ip_tracing,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+#ifdef CONFIG_X86_KERNEL_IBT
+static unsigned long get_entry_ip(unsigned long fentry_ip)
+{
+ u32 instr;
+
+ /* Being extra safe in here in case entry ip is on the page-edge. */
+ if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
+ return fentry_ip;
+ if (is_endbr(instr))
+ fentry_ip -= ENDBR_INSN_SIZE;
+ return fentry_ip;
+}
+#else
+#define get_entry_ip(fentry_ip) fentry_ip
+#endif
+
+BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
+{
+ struct bpf_trace_run_ctx *run_ctx __maybe_unused;
+ struct kprobe *kp;
+
+#ifdef CONFIG_UPROBES
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ if (run_ctx->is_uprobe)
+ return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr;
+#endif
+
+ kp = kprobe_running();
+
+ if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
+ return 0;
+
+ return get_entry_ip((uintptr_t)kp->addr);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
+ .func = bpf_get_func_ip_kprobe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+ .func = bpf_get_func_ip_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+ .func = bpf_get_attach_cookie_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = {
+ .func = bpf_get_func_ip_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = {
+ .func = bpf_get_attach_cookie_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
+ .func = bpf_get_attach_cookie_trace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
+{
+ return ctx->event->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
+ .func = bpf_get_attach_cookie_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
+ .func = bpf_get_attach_cookie_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
+{
+#ifndef CONFIG_X86
+ return -ENOENT;
+#else
+ static const u32 br_entry_size = sizeof(struct perf_branch_entry);
+ u32 entry_cnt = size / br_entry_size;
+
+ entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (!entry_cnt)
+ return -ENOENT;
+
+ return entry_cnt * br_entry_size;
+#endif
+}
+
+static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+ .func = bpf_get_branch_snapshot,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ if ((u64) n >= nr_args)
+ return -EINVAL;
+ *value = ((u64 *)ctx)[n];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_proto = {
+ .func = get_func_arg,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ *value = ((u64 *)ctx)[nr_args];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ret_proto = {
+ .func = get_func_ret,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(get_func_arg_cnt, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
+ .func = get_func_arg_cnt,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+#ifdef CONFIG_KEYS
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_ptr: data to verify
+ * @sig_ptr: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+ struct bpf_dynptr_kern *sig_ptr,
+ struct bpf_key *trusted_keyring)
+{
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
+ NULL);
+}
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(key_sig_kfunc_set)
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+BTF_SET8_END(key_sig_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &key_sig_kfunc_set,
+};
+
+static int __init bpf_key_sig_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_key_sig_kfunc_set);
+}
+
+late_initcall(bpf_key_sig_kfuncs_init);
+#endif /* CONFIG_KEYS */
+
+/* filesystem kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_ptr: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+ struct bpf_dynptr_kern *value_ptr)
+{
+ struct dentry *dentry;
+ u32 value_len;
+ void *value;
+ int ret;
+
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ dentry = file_dentry(file);
+ ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
+ if (ret)
+ return ret;
+ return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_SET8_END(fs_kfunc_set_ids)
+
+static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fs_kfunc_set_ids,
+ .filter = bpf_get_file_xattr_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
+
+static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -1070,6 +1517,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
@@ -1080,6 +1529,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -1092,29 +1545,36 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return &bpf_probe_read_kernel_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return &bpf_probe_read_kernel_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return &bpf_probe_read_compat_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return &bpf_probe_read_compat_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1134,8 +1594,42 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ case BPF_FUNC_get_task_stack:
+ return &bpf_get_task_stack_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
+ case BPF_FUNC_snprintf_btf:
+ return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_per_cpu_ptr:
+ return &bpf_per_cpu_ptr_proto;
+ case BPF_FUNC_this_cpu_ptr:
+ return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
+ case BPF_FUNC_get_func_ip:
+ return &bpf_get_func_ip_proto_tracing;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1153,6 +1647,18 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
+ case BPF_FUNC_get_func_ip:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_func_ip_proto_kprobe_multi;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_func_ip_proto_uprobe_multi;
+ return &bpf_get_func_ip_proto_kprobe;
+ case BPF_FUNC_get_attach_cookie:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_kmulti;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_umulti;
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1207,7 +1713,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1263,6 +1769,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1320,9 +1828,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
struct perf_branch_stack *br_stack = ctx->data->br_stack;
u32 to_copy;
@@ -1330,8 +1835,11 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
return -EINVAL;
+ if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)))
+ return -ENOENT;
+
if (unlikely(!br_stack))
- return -EINVAL;
+ return -ENOENT;
if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
return br_stack->nr * br_entry_size;
@@ -1343,7 +1851,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
memcpy(buf, br_stack->entries, to_copy);
return to_copy;
-#endif
}
static const struct bpf_func_proto bpf_read_branch_records_proto = {
@@ -1363,13 +1870,15 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
- return &bpf_get_stackid_proto_tp;
+ return &bpf_get_stackid_proto_pe;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto_tp;
+ return &bpf_get_stack_proto_pe;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
case BPF_FUNC_read_branch_records:
return &bpf_read_branch_records_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_pe;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1429,12 +1938,13 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1483,7 +1993,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1506,12 +2016,38 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *fn;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_skb_output:
return &bpf_skb_output_proto;
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
+ case BPF_FUNC_skc_to_tcp6_sock:
+ return &bpf_skc_to_tcp6_sock_proto;
+ case BPF_FUNC_skc_to_tcp_sock:
+ return &bpf_skc_to_tcp_sock_proto;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ return &bpf_skc_to_tcp_timewait_sock_proto;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ return &bpf_skc_to_tcp_request_sock_proto;
+ case BPF_FUNC_skc_to_udp6_sock:
+ return &bpf_skc_to_udp6_sock_proto;
+ case BPF_FUNC_skc_to_unix_sock:
+ return &bpf_skc_to_unix_sock_proto;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ return &bpf_skc_to_mptcp_sock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_tracing_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_tracing_proto;
+ case BPF_FUNC_sock_from_file:
+ return &bpf_sock_from_file_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_trace_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
@@ -1521,8 +2057,25 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_write_proto :
NULL;
+ case BPF_FUNC_seq_printf_btf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_btf_proto :
+ NULL;
+ case BPF_FUNC_d_path:
+ return &bpf_d_path_proto;
+ case BPF_FUNC_get_func_arg:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ case BPF_FUNC_get_func_ret:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
+ case BPF_FUNC_get_func_arg_cnt:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
- return raw_tp_prog_func_proto(func_id, prog);
+ fn = raw_tp_prog_func_proto(func_id, prog);
+ if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
+ fn = bpf_iter_get_func_proto(func_id, prog);
+ return fn;
}
}
@@ -1531,13 +2084,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return true;
+ return bpf_tracing_ctx_access(off, size, type);
}
static bool tracing_prog_is_valid_access(int off, int size,
@@ -1545,13 +2092,7 @@ static bool tracing_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return btf_ctx_access(off, size, type, prog, info);
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
@@ -1567,6 +2108,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
};
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_prog_test_run_raw_tp,
+#endif
};
const struct bpf_verifier_ops tracing_verifier_ops = {
@@ -1687,7 +2231,8 @@ static DEFINE_MUTEX(bpf_event_mutex);
#define BPF_TRACE_MAX_PROGS 64
int perf_event_attach_bpf_prog(struct perf_event *event,
- struct bpf_prog *prog)
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
@@ -1714,14 +2259,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
goto unlock;
}
- ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
if (ret < 0)
goto unlock;
/* set the new array to event->tp_event and set event->prog */
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
unlock:
mutex_unlock(&bpf_event_mutex);
@@ -1740,14 +2286,14 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
- ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
if (ret == -ENOENT)
goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
}
bpf_prog_put(event->prog);
@@ -1815,19 +2361,27 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
- struct module *mod = __module_address((unsigned long)btp);
+ struct module *mod;
- if (mod)
- module_put(mod);
+ preempt_disable();
+ mod = __module_address((unsigned long)btp);
+ module_put(mod);
+ preempt_enable();
}
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ goto out;
+ }
rcu_read_lock();
- (void) BPF_PROG_RUN(prog, args);
+ (void) bpf_prog_run(prog, args);
rcu_read_unlock();
+out:
+ this_cpu_dec(*(prog->active));
}
#define UNPACK(...) __VA_ARGS__
@@ -1889,7 +2443,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
+ prog);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
@@ -1904,7 +2459,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
u32 *fd_type, const char **buf,
- u64 *probe_offset, u64 *probe_addr)
+ u64 *probe_offset, u64 *probe_addr,
+ unsigned long *missed)
{
bool is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
@@ -1926,22 +2482,26 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
if (is_tracepoint || is_syscall_tp) {
*buf = is_tracepoint ? event->tp_event->tp->name
: event->tp_event->name;
- *fd_type = BPF_FD_TYPE_TRACEPOINT;
- *probe_offset = 0x0;
- *probe_addr = 0x0;
+ /* We allow NULL pointer for tracepoint */
+ if (fd_type)
+ *fd_type = BPF_FD_TYPE_TRACEPOINT;
+ if (probe_offset)
+ *probe_offset = 0x0;
+ if (probe_addr)
+ *probe_addr = 0x0;
} else {
/* kprobe/uprobe */
err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
if (flags & TRACE_EVENT_FL_KPROBE)
err = bpf_get_kprobe_info(event, fd_type, buf,
- probe_offset, probe_addr,
+ probe_offset, probe_addr, missed,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
if (flags & TRACE_EVENT_FL_UPROBE)
err = bpf_get_uprobe_info(event, fd_type, buf,
- probe_offset,
+ probe_offset, probe_addr,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
}
@@ -1969,10 +2529,11 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
{
struct bpf_trace_module *btm, *tmp;
struct module *mod = module;
+ int ret = 0;
if (mod->num_bpf_raw_events == 0 ||
(op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
- return 0;
+ goto out;
mutex_lock(&bpf_module_mutex);
@@ -1982,6 +2543,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
if (btm) {
btm->module = module;
list_add(&btm->list, &bpf_trace_modules);
+ } else {
+ ret = -ENOMEM;
}
break;
case MODULE_STATE_GOING:
@@ -1997,7 +2560,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
mutex_unlock(&bpf_module_mutex);
- return 0;
+out:
+ return notifier_from_errno(ret);
}
static struct notifier_block bpf_module_nb = {
@@ -2012,3 +2576,895 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ unsigned long *addrs;
+ u64 *cookies;
+ u32 cnt;
+ u32 mods_cnt;
+ struct module **mods;
+ u32 flags;
+};
+
+struct bpf_kprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ unsigned long entry_ip;
+};
+
+struct user_syms {
+ const char **syms;
+ char *buf;
+};
+
+static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
+{
+ unsigned long __user usymbol;
+ const char **syms = NULL;
+ char *buf = NULL, *p;
+ int err = -ENOMEM;
+ unsigned int i;
+
+ syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL);
+ if (!syms)
+ goto error;
+
+ buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ goto error;
+
+ for (p = buf, i = 0; i < cnt; i++) {
+ if (__get_user(usymbol, usyms + i)) {
+ err = -EFAULT;
+ goto error;
+ }
+ err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ syms[i] = p;
+ p += err + 1;
+ }
+
+ us->syms = syms;
+ us->buf = buf;
+ return 0;
+
+error:
+ if (err) {
+ kvfree(syms);
+ kvfree(buf);
+ }
+ return err;
+}
+
+static void kprobe_multi_put_modules(struct module **mods, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++)
+ module_put(mods[i]);
+}
+
+static void free_user_syms(struct user_syms *us)
+{
+ kvfree(us->syms);
+ kvfree(us->buf);
+}
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ unregister_fprobe(&kmulti_link->fp);
+ kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ kvfree(kmulti_link->addrs);
+ kvfree(kmulti_link->cookies);
+ kfree(kmulti_link->mods);
+ kfree(kmulti_link);
+}
+
+static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
+ struct bpf_kprobe_multi_link *kmulti_link;
+ u32 ucount = info->kprobe_multi.count;
+ int err = 0, i;
+
+ if (!uaddrs ^ !ucount)
+ return -EINVAL;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ info->kprobe_multi.count = kmulti_link->cnt;
+ info->kprobe_multi.flags = kmulti_link->flags;
+ info->kprobe_multi.missed = kmulti_link->fp.nmissed;
+
+ if (!uaddrs)
+ return 0;
+ if (ucount < kmulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = kmulti_link->cnt;
+
+ if (kallsyms_show_value(current_cred())) {
+ if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
+ return -EFAULT;
+ } else {
+ for (i = 0; i < ucount; i++) {
+ if (put_user(0, uaddrs + i))
+ return -EFAULT;
+ }
+ }
+ return err;
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+ .release = bpf_kprobe_multi_link_release,
+ .dealloc = bpf_kprobe_multi_link_dealloc,
+ .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
+};
+
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+ const struct bpf_kprobe_multi_link *link = priv;
+ unsigned long *addr_a = a, *addr_b = b;
+ u64 *cookie_a, *cookie_b;
+
+ cookie_a = link->cookies + (addr_a - link->addrs);
+ cookie_b = link->cookies + (addr_b - link->addrs);
+
+ /* swap addr_a/addr_b and cookie_a/cookie_b values */
+ swap(*addr_a, *addr_b);
+ swap(*cookie_a, *cookie_b);
+}
+
+static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b)
+{
+ const unsigned long *addr_a = a, *addr_b = b;
+
+ if (*addr_a == *addr_b)
+ return 0;
+ return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+ return bpf_kprobe_multi_addrs_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ u64 *cookie, entry_ip;
+ unsigned long *addr;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ link = run_ctx->link;
+ if (!link->cookies)
+ return 0;
+ entry_ip = run_ctx->entry_ip;
+ addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
+ bpf_kprobe_multi_addrs_cmp);
+ if (!addr)
+ return 0;
+ cookie = link->cookies + (addr - link->addrs);
+ return *cookie;
+}
+
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+ unsigned long entry_ip, struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .link = link,
+ .entry_ip = entry_ip,
+ };
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ bpf_prog_inc_misses_counter(link->link.prog);
+ err = 0;
+ goto out;
+ }
+
+ migrate_disable();
+ rcu_read_lock();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static int
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *data)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ return 0;
+}
+
+static void
+kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *data)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+}
+
+static int symbols_cmp_r(const void *a, const void *b, const void *priv)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+struct multi_symbols_sort {
+ const char **funcs;
+ u64 *cookies;
+};
+
+static void symbols_swap_r(void *a, void *b, int size, const void *priv)
+{
+ const struct multi_symbols_sort *data = priv;
+ const char **name_a = a, **name_b = b;
+
+ swap(*name_a, *name_b);
+
+ /* If defined, swap also related cookies. */
+ if (data->cookies) {
+ u64 *cookie_a, *cookie_b;
+
+ cookie_a = data->cookies + (name_a - data->funcs);
+ cookie_b = data->cookies + (name_b - data->funcs);
+ swap(*cookie_a, *cookie_b);
+ }
+}
+
+struct modules_array {
+ struct module **mods;
+ int mods_cnt;
+ int mods_cap;
+};
+
+static int add_module(struct modules_array *arr, struct module *mod)
+{
+ struct module **mods;
+
+ if (arr->mods_cnt == arr->mods_cap) {
+ arr->mods_cap = max(16, arr->mods_cap * 3 / 2);
+ mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);
+ if (!mods)
+ return -ENOMEM;
+ arr->mods = mods;
+ }
+
+ arr->mods[arr->mods_cnt] = mod;
+ arr->mods_cnt++;
+ return 0;
+}
+
+static bool has_module(struct modules_array *arr, struct module *mod)
+{
+ int i;
+
+ for (i = arr->mods_cnt - 1; i >= 0; i--) {
+ if (arr->mods[i] == mod)
+ return true;
+ }
+ return false;
+}
+
+static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
+{
+ struct modules_array arr = {};
+ u32 i, err = 0;
+
+ for (i = 0; i < addrs_cnt; i++) {
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addrs[i]);
+ /* Either no module or we it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ preempt_enable();
+ continue;
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
+ preempt_enable();
+ if (err)
+ break;
+ err = add_module(&arr, mod);
+ if (err) {
+ module_put(mod);
+ break;
+ }
+ }
+
+ /* We return either err < 0 in case of error, ... */
+ if (err) {
+ kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
+ kfree(arr.mods);
+ return err;
+ }
+
+ /* or number of modules found if everything is ok. */
+ *mods = arr.mods;
+ return arr.mods_cnt;
+}
+
+static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ if (!within_error_injection_list(addrs[i]))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_kprobe_multi_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ void __user *ucookies;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 *cookies = NULL;
+ void __user *usyms;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.kprobe_multi.flags;
+ if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+ usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+ if (!!uaddrs == !!usyms)
+ return -EINVAL;
+
+ cnt = attr->link_create.kprobe_multi.cnt;
+ if (!cnt)
+ return -EINVAL;
+ if (cnt > MAX_KPROBE_MULTI_CNT)
+ return -E2BIG;
+
+ size = cnt * sizeof(*addrs);
+ addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ if (uaddrs) {
+ if (copy_from_user(addrs, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ struct multi_symbols_sort data = {
+ .cookies = cookies,
+ };
+ struct user_syms us;
+
+ err = copy_user_syms(&us, usyms, cnt);
+ if (err)
+ goto error;
+
+ if (cookies)
+ data.funcs = us.syms;
+
+ sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
+ symbols_swap_r, &data);
+
+ err = ftrace_lookup_symbols(us.syms, cnt, addrs);
+ free_user_syms(&us);
+ if (err)
+ goto error;
+ }
+
+ if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+ &bpf_kprobe_multi_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ if (flags & BPF_F_KPROBE_MULTI_RETURN)
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
+ else
+ link->fp.entry_handler = kprobe_multi_link_handler;
+
+ link->addrs = addrs;
+ link->cookies = cookies;
+ link->cnt = cnt;
+ link->flags = flags;
+
+ if (cookies) {
+ /*
+ * Sorting addresses will trigger sorting cookies as well
+ * (check bpf_kprobe_multi_cookie_swap). This way we can
+ * find cookie based on the address in bpf_get_attach_cookie
+ * helper.
+ */
+ sort_r(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_cookie_cmp,
+ bpf_kprobe_multi_cookie_swap,
+ link);
+ }
+
+ err = get_modules_for_addrs(&link->mods, addrs, cnt);
+ if (err < 0) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ link->mods_cnt = err;
+
+ err = register_fprobe_ips(&link->fp, addrs, cnt);
+ if (err) {
+ kprobe_multi_put_modules(link->mods, link->mods_cnt);
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kvfree(addrs);
+ kvfree(cookies);
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_UPROBES
+struct bpf_uprobe_multi_link;
+
+struct bpf_uprobe {
+ struct bpf_uprobe_multi_link *link;
+ loff_t offset;
+ unsigned long ref_ctr_offset;
+ u64 cookie;
+ struct uprobe_consumer consumer;
+};
+
+struct bpf_uprobe_multi_link {
+ struct path path;
+ struct bpf_link link;
+ u32 cnt;
+ u32 flags;
+ struct bpf_uprobe *uprobes;
+ struct task_struct *task;
+};
+
+struct bpf_uprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ unsigned long entry_ip;
+ struct bpf_uprobe *uprobe;
+};
+
+static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
+ u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
+ &uprobes[i].consumer);
+ }
+}
+
+static void bpf_uprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+}
+
+static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
+ kvfree(umulti_link->uprobes);
+ kfree(umulti_link);
+}
+
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+ u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+ u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+ u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+ u32 upath_size = info->uprobe_multi.path_size;
+ struct bpf_uprobe_multi_link *umulti_link;
+ u32 ucount = info->uprobe_multi.count;
+ int err = 0, i;
+ long left;
+
+ if (!upath ^ !upath_size)
+ return -EINVAL;
+
+ if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+ return -EINVAL;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ info->uprobe_multi.count = umulti_link->cnt;
+ info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+ if (upath) {
+ char *p, *buf;
+
+ upath_size = min_t(u32, upath_size, PATH_MAX);
+
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return PTR_ERR(p);
+ }
+ upath_size = buf + upath_size - p;
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
+ }
+
+ if (!uoffsets && !ucookies && !uref_ctr_offsets)
+ return 0;
+
+ if (ucount < umulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = umulti_link->cnt;
+
+ for (i = 0; i < ucount; i++) {
+ if (uoffsets &&
+ put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+ return -EFAULT;
+ if (uref_ctr_offsets &&
+ put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+ return -EFAULT;
+ if (ucookies &&
+ put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
+ .release = bpf_uprobe_multi_link_release,
+ .dealloc = bpf_uprobe_multi_link_dealloc,
+ .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
+};
+
+static int uprobe_prog_run(struct bpf_uprobe *uprobe,
+ unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_uprobe_multi_link *link = uprobe->link;
+ struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .entry_ip = entry_ip,
+ .uprobe = uprobe,
+ };
+ struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->aux->sleepable;
+ struct bpf_run_ctx *old_run_ctx;
+ int err = 0;
+
+ if (link->task && current != link->task)
+ return 0;
+
+ if (sleepable)
+ rcu_read_lock_trace();
+ else
+ rcu_read_lock();
+
+ migrate_disable();
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+
+ migrate_enable();
+
+ if (sleepable)
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
+ return err;
+}
+
+static bool
+uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe->link->task->mm == mm;
+}
+
+static int
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+}
+
+static int
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, func, regs);
+}
+
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->uprobe->cookie;
+}
+
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_uprobe_multi_link *link = NULL;
+ unsigned long __user *uref_ctr_offsets;
+ struct bpf_link_primer link_primer;
+ struct bpf_uprobe *uprobes = NULL;
+ struct task_struct *task = NULL;
+ unsigned long __user *uoffsets;
+ u64 __user *ucookies;
+ void __user *upath;
+ u32 flags, cnt, i;
+ struct path path;
+ char *name;
+ pid_t pid;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.uprobe_multi.flags;
+ if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ /*
+ * path, offsets and cnt are mandatory,
+ * ref_ctr_offsets and cookies are optional
+ */
+ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
+ cnt = attr->link_create.uprobe_multi.cnt;
+
+ if (!upath || !uoffsets || !cnt)
+ return -EINVAL;
+ if (cnt > MAX_UPROBE_MULTI_CNT)
+ return -E2BIG;
+
+ uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
+ ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
+
+ name = strndup_user(upath, PATH_MAX);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ return err;
+ }
+
+ err = kern_path(name, LOOKUP_FOLLOW, &path);
+ kfree(name);
+ if (err)
+ return err;
+
+ if (!d_is_reg(path.dentry)) {
+ err = -EBADF;
+ goto error_path_put;
+ }
+
+ pid = attr->link_create.uprobe_multi.pid;
+ if (pid) {
+ rcu_read_lock();
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task) {
+ err = -ESRCH;
+ goto error_path_put;
+ }
+ }
+
+ err = -ENOMEM;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);
+
+ if (!uprobes || !link)
+ goto error_free;
+
+ for (i = 0; i < cnt; i++) {
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (uprobes[i].offset < 0) {
+ err = -EINVAL;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+
+ uprobes[i].link = link;
+
+ if (flags & BPF_F_UPROBE_MULTI_RETURN)
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ else
+ uprobes[i].consumer.handler = uprobe_multi_link_handler;
+
+ if (pid)
+ uprobes[i].consumer.filter = uprobe_multi_link_filter;
+ }
+
+ link->cnt = cnt;
+ link->uprobes = uprobes;
+ link->path = path;
+ link->task = task;
+ link->flags = flags;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
+ &bpf_uprobe_multi_link_lops, prog);
+
+ for (i = 0; i < cnt; i++) {
+ err = uprobe_register_refctr(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (err) {
+ bpf_uprobe_unregister(&path, uprobes, i);
+ goto error_free;
+ }
+ }
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error_free;
+
+ return bpf_link_settle(&link_primer);
+
+error_free:
+ kvfree(uprobes);
+ kfree(link);
+ if (task)
+ put_task_struct(task);
+error_path_put:
+ path_put(&path);
+ return err;
+}
+#else /* !CONFIG_UPROBES */
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif /* CONFIG_UPROBES */
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
new file mode 100644
index 000000000000..9acbc11ac7bb
--- /dev/null
+++ b/kernel/trace/bpf_trace.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf_trace
+
+#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_BPF_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(bpf_trace_printk,
+
+ TP_PROTO(const char *bpf_string),
+
+ TP_ARGS(bpf_string),
+
+ TP_STRUCT__entry(
+ __string(bpf_string, bpf_string)
+ ),
+
+ TP_fast_assign(
+ __assign_str(bpf_string, bpf_string);
+ ),
+
+ TP_printk("%s", __get_str(bpf_string))
+);
+
+#endif /* _TRACE_BPF_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE bpf_trace
+
+#include <trace/define_trace.h>
diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c
new file mode 100644
index 000000000000..f89792c25b11
--- /dev/null
+++ b/kernel/trace/error_report-traces.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Error reporting trace points.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/error_report.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 1af321dec0f1..c83c005e654e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -14,6 +15,7 @@
#include <trace/events/sched.h>
#include "ftrace_internal.h"
+#include "trace.h"
#ifdef CONFIG_DYNAMIC_FTRACE
#define ASSIGN_OPS_HASH(opsname, val) \
@@ -23,26 +25,34 @@
#define ASSIGN_OPS_HASH(opsname, val)
#endif
-static bool kill_ftrace_graph;
+DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
/* Both enabled by default (can be cleared by function_graph tracer flags */
static bool fgraph_sleep_time = true;
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * archs can override this function if they must do something
+ * to enable hook for graph tracer.
+ */
+int __weak ftrace_enable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * to disable hook for graph tracer.
*/
-bool ftrace_graph_is_dead(void)
+int __weak ftrace_disable_ftrace_graph_caller(void)
{
- return kill_ftrace_graph;
+ return 0;
}
+#endif
/**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
+ * ftrace_graph_stop - set to permanently disable function graph tracing
*
* In case of an error int function graph tracing, this is called
* to try to keep function graph tracing from causing any more harm.
@@ -51,7 +61,7 @@ bool ftrace_graph_is_dead(void)
*/
void ftrace_graph_stop(void)
{
- kill_ftrace_graph = true;
+ static_branch_enable(&kill_ftrace_graph);
}
/* Add a function return address to the trace stack on thread info.*/
@@ -115,15 +125,17 @@ int function_graph_enter(unsigned long ret, unsigned long func,
{
struct ftrace_graph_ent trace;
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
/*
* Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses is unpredicatable anymore.
+ * since call sequence and return addresses are unpredictable anyway.
* Ex: BPF trampoline may call original function and may skip frame
* depending on type of BPF programs attached.
*/
if (ftrace_direct_func_count &&
ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
return -EBUSY;
+#endif
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -225,16 +237,23 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
+/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
+struct fgraph_ret_regs;
+
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
+ unsigned long frame_pointer)
{
struct ftrace_graph_ret trace;
unsigned long ret;
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ trace.retval = fgraph_ret_regs_return_value(ret_regs);
+#endif
trace.rettime = trace_clock_local();
ftrace_graph_return(&trace);
/*
@@ -255,6 +274,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+/*
+ * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
+ * leave only ftrace_return_to_handler(ret_regs).
+ */
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
+unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+{
+ return __ftrace_return_to_handler(ret_regs,
+ fgraph_ret_regs_frame_pointer(ret_regs));
+}
+#else
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+ return __ftrace_return_to_handler(NULL, frame_pointer);
+}
+#endif
+
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
* @task: The task to read the shadow stack from
@@ -333,11 +369,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .func = ftrace_graph_func,
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID |
- FTRACE_OPS_FL_STUB,
+ FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
@@ -387,15 +422,14 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
}
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
if (start == end) {
ret = -EAGAIN;
goto unlock;
}
if (t->ret_stack == NULL) {
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->curr_ret_stack = -1;
t->curr_ret_depth = -1;
@@ -403,10 +437,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
- } while_each_thread(g, t);
+ }
unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
free:
for (i = start; i < end; i++)
kfree(ret_stack_list[i]);
@@ -415,7 +449,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
unsigned long long timestamp;
int index;
@@ -490,7 +526,6 @@ static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
static void
graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
{
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
/* make curr_ret_stack visible before we add the ret_stack */
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..9ff018245840
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "trace.h"
+
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+ unsigned long entry_parent_ip;
+ char data[];
+};
+
+static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh = NULL;
+ struct fprobe *fp;
+ void *entry_data = NULL;
+ int ret = 0;
+
+ fp = container_of(ops, struct fprobe, ops);
+
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ return;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ fpr->entry_parent_ip = parent_ip;
+ if (fp->entry_data_size)
+ entry_data = fpr->data;
+ }
+
+ if (fp->entry_handler)
+ ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
+
+ /* If entry_handler returns !0, nmissed is not counted. */
+ if (rh) {
+ if (ret)
+ rethook_recycle(rh);
+ else
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
+}
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+ __fprobe_handler(ip, parent_ip, ops, fregs);
+ ftrace_test_recursion_unlock(bit);
+
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions called before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ /*
+ * This user handler is shared with other kprobes and is not expected to be
+ * called recursively. So if any other kprobe handler is running, this will
+ * exit as kprobe does. See the section 'Share the callbacks with kprobes'
+ * in Documentation/trace/fprobe.rst for more information.
+ */
+ if (unlikely(kprobe_running())) {
+ fp->nmissed++;
+ goto recursion_unlock;
+ }
+
+ kprobe_busy_begin();
+ __fprobe_handler(ip, parent_ip, ops, fregs);
+ kprobe_busy_end();
+
+recursion_unlock:
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ unsigned long ret_ip, struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+ int bit;
+
+ if (!fp || fprobe_disabled(fp))
+ return;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ /*
+ * we need to assure no calls to traceable functions in-between the
+ * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ */
+ bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
+ fp->entry_data_size ? (void *)fpr->data : NULL);
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+ unsigned long *addrs;
+
+ /* Convert symbols to symbol address */
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ /* ftrace_lookup_symbols expects sorted symbols */
+ sort(syms, num, sizeof(*syms), symbols_cmp, NULL);
+
+ if (!ftrace_lookup_symbols(syms, num, addrs))
+ return addrs;
+
+ kfree(addrs);
+ return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+ fp->nmissed = 0;
+ if (fprobe_shared_with_kprobes(fp))
+ fp->ops.func = fprobe_kprobe_handler;
+ else
+ fp->ops.func = fprobe_handler;
+ fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+ int size;
+
+ if (!fp->exit_handler) {
+ fp->rethook = NULL;
+ return 0;
+ }
+
+ /* Initialize rethook if needed */
+ if (fp->nr_maxactive)
+ num = fp->nr_maxactive;
+ else
+ num *= num_possible_cpus() * 2;
+ if (num <= 0)
+ return -EINVAL;
+
+ size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+
+ /* Initialize rethook */
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
+ if (IS_ERR(fp->rethook))
+ return PTR_ERR(fp->rethook);
+
+ return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ if (!IS_ERR_OR_NULL(fp->rethook)) {
+ /* Don't need to cleanup rethook->handler because this is not used. */
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ }
+ ftrace_free_filter(&fp->ops);
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+ struct ftrace_hash *hash;
+ unsigned char *str;
+ int ret, len;
+
+ if (!fp || !filter)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ len = strlen(filter);
+ str = kstrdup(filter, GFP_KERNEL);
+ ret = ftrace_set_filter(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ return ret;
+
+ if (notfilter) {
+ len = strlen(notfilter);
+ str = kstrdup(notfilter, GFP_KERNEL);
+ ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ goto out;
+ }
+
+ /* TODO:
+ * correctly calculate the total number of filtered symbols
+ * from both filter and notfilter.
+ */
+ hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
+ if (WARN_ON_ONCE(!hash))
+ goto out;
+
+ ret = fprobe_init_rethook(fp, (int)hash->count);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+out:
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ int ret;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = fprobe_init_rethook(fp, num);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+ unsigned long *addrs;
+ int ret;
+
+ if (!fp || !syms || num <= 0)
+ return -EINVAL;
+
+ addrs = get_ftrace_locations(syms, num);
+ if (IS_ERR(addrs))
+ return PTR_ERR(addrs);
+
+ ret = register_fprobe_ips(fp, addrs, num);
+
+ kfree(addrs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+bool fprobe_is_registered(struct fprobe *fp)
+{
+ if (!fp || (fp->ops.saved_func != fprobe_handler &&
+ fp->ops.saved_func != fprobe_kprobe_handler))
+ return false;
+ return true;
+}
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fprobe_is_registered(fp))
+ return -EINVAL;
+
+ if (!IS_ERR_OR_NULL(fp->rethook))
+ rethook_stop(fp->rethook);
+
+ ret = unregister_ftrace_function(&fp->ops);
+ if (ret < 0)
+ return ret;
+
+ if (!IS_ERR_OR_NULL(fp->rethook))
+ rethook_free(fp->rethook);
+
+ ftrace_free_filter(&fp->ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1903b80db6eb..83ba342aef31 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,12 @@
#include "trace_output.h"
#include "trace_stat.h"
+/* Flags that do not get reset */
+#define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED | \
+ FTRACE_FL_MODIFIED)
+
+#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
+
#define FTRACE_WARN_ON(cond) \
({ \
int ___r = cond; \
@@ -80,13 +86,13 @@ enum {
struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ .flags = FTRACE_OPS_FL_STUB,
INIT_OPS_HASH(ftrace_list_end)
};
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
-static int last_ftrace_enabled;
+static int __maybe_unused last_ftrace_enabled;
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
@@ -119,13 +125,35 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
-#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
-#else
-/* See comment below, where ftrace_ops_list_func is defined */
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
-#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */
+void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+/*
+ * Stub used to invoke the list ops without requiring a separate trampoline.
+ */
+const struct ftrace_ops ftrace_list_ops = {
+ .func = ftrace_ops_list_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
+
+static void ftrace_ops_nop_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ /* do nothing */
+}
+
+/*
+ * Stub used when a call site is disabled. May be called transiently by threads
+ * which have made it into ftrace_caller but haven't yet recovered the ops at
+ * the point the call site is disabled.
+ */
+const struct ftrace_ops ftrace_nop_ops = {
+ .func = ftrace_ops_nop_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
#endif
static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -139,11 +167,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#endif
}
-#define FTRACE_PID_IGNORE -1
-#define FTRACE_PID_TRACE -2
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
int pid;
@@ -157,7 +182,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
return;
}
- op->saved_func(ip, parent_ip, op, regs);
+ op->saved_func(ip, parent_ip, op, fregs);
}
static void ftrace_sync_ipi(void *data)
@@ -169,7 +194,7 @@ static void ftrace_sync_ipi(void *data)
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic, RCU, or per CPU ops, or we force list func,
+ * If this is a dynamic or RCU ops, or we force list func,
* then it needs to call the list anyway.
*/
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
@@ -233,7 +258,7 @@ static void update_ftrace_function(void)
/*
* For static tracing, we need to be a bit more careful.
* The function change takes affect immediately. Thus,
- * we need to coorditate the setting of the function_trace_ops
+ * we need to coordinate the setting of the function_trace_ops
* with the setting of the ftrace_trace_function.
*
* Set the function to the list ops, which will call the
@@ -326,7 +351,7 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
return -EBUSY;
- if (!core_kernel_data((unsigned long)ops))
+ if (!is_kernel_core_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
add_ftrace_ops(&ftrace_ops_list, ops);
@@ -584,7 +609,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
}
-int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
{
struct ftrace_profile_page *pg;
int functions;
@@ -757,7 +782,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
static void
function_profile_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
@@ -869,7 +894,7 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
INIT_OPS_HASH(ftrace_profile_ops)
};
@@ -960,7 +985,6 @@ static struct tracer_stat function_stats __initdata = {
static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
- struct dentry *entry;
char *name;
int ret;
int cpu;
@@ -991,10 +1015,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
}
- entry = tracefs_create_file("function_profile_enabled", 0644,
- d_tracer, NULL, &ftrace_profile_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
+ trace_create_file("function_profile_enabled",
+ TRACE_MODE_WRITE, d_tracer, NULL,
+ &ftrace_profile_fops);
}
#else /* CONFIG_FUNCTION_PROFILER */
@@ -1043,13 +1066,12 @@ struct ftrace_ops global_ops = {
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
/*
- * Used by the stack undwinder to know about dynamic ftrace trampolines.
+ * Used by the stack unwinder to know about dynamic ftrace trampolines.
*/
struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
@@ -1094,7 +1116,7 @@ struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
int index;
- int size;
+ int order;
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
@@ -1161,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
struct ftrace_func_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- return -ENOMEM;
+ return NULL;
entry->ip = ip;
__add_hash_entry(hash, entry);
- return 0;
+ return entry;
}
static void
@@ -1257,12 +1280,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
}
+/**
+ * ftrace_free_filter - remove all filters for an ftrace_ops
+ * @ops - the ops to remove the filters from
+ */
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
}
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
@@ -1298,6 +1326,7 @@ static int ftrace_add_mod(struct trace_array *tr,
if (!ftrace_mod)
return -ENOMEM;
+ INIT_LIST_HEAD(&ftrace_mod->list);
ftrace_mod->func = kstrdup(func, GFP_KERNEL);
ftrace_mod->module = kstrdup(module, GFP_KERNEL);
ftrace_mod->enable = enable;
@@ -1321,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
int size;
- int ret;
int i;
new_hash = alloc_ftrace_hash(size_bits);
@@ -1338,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- ret = add_hash_entry(new_hash, entry->ip);
- if (ret < 0)
+ if (add_hash_entry(new_hash, entry->ip) == NULL)
goto free_hash;
}
}
@@ -1371,10 +1398,10 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
int i;
/*
- * Make the hash size about 1/2 the # found
+ * Use around half the size (max bit of it), but
+ * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits).
*/
- for (size /= 2; size; size >>= 1)
- bits++;
+ bits = fls(size / 2);
/* Don't allocate too much */
if (bits > FTRACE_HASH_MAX_BITS)
@@ -1454,7 +1481,7 @@ static bool hash_contains_ip(unsigned long ip,
{
/*
* The function record is a match if it exists in the filter
- * hash and not in the notrace hash. Note, an emty hash is
+ * hash and not in the notrace hash. Note, an empty hash is
* considered a match for the filter hash, but an empty
* notrace hash is considered not in the notrace hash.
*/
@@ -1540,7 +1567,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
key.flags = end; /* overload flags, as it is unsigned long */
for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
+ if (pg->index == 0 ||
+ end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
continue;
rec = bsearch(&key, pg->records, pg->index,
@@ -1576,17 +1604,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
}
/**
- * ftrace_location - return true if the ip giving is a traced location
+ * ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * Returns rec->ip if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
+ * If @ip matches the ftrace location, return @ip.
+ * If @ip matches sym+0, return sym's ftrace location.
+ * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
- return ftrace_location_range(ip, ip);
+ struct dyn_ftrace *rec;
+ unsigned long offset;
+ unsigned long size;
+
+ rec = lookup_rec(ip, ip);
+ if (!rec) {
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset))
+ goto out;
+
+ /* map sym+0 to __fentry__ */
+ if (!offset)
+ rec = lookup_rec(ip, ip + size - 1);
+ }
+
+ if (rec)
+ return rec->ip;
+
+out:
+ return 0;
}
/**
@@ -1632,8 +1677,22 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude);
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+static bool skip_record(struct dyn_ftrace *rec)
+{
+ /*
+ * At boot up, weak functions are set to disable. Function tracing
+ * can be enabled before they are, and they still need to be disabled now.
+ * If the record is disabled, still continue if it is marked as already
+ * enabled (this is needed to keep the accounting working).
+ */
+ return rec->flags & FTRACE_FL_DISABLED &&
+ !(rec->flags & FTRACE_FL_ENABLED);
+}
+
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1683,7 +1742,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
if (all) {
@@ -1781,7 +1840,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* to it.
*/
if (ftrace_rec_count(rec) == 1 &&
- ftrace_find_tramp_ops_any(rec))
+ ftrace_find_tramp_ops_any_other(rec, ops))
rec->flags |= FTRACE_FL_TRAMP;
else
rec->flags &= ~FTRACE_FL_TRAMP;
@@ -1791,6 +1850,18 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* if rec count is zero.
*/
}
+
+ /*
+ * If the rec has a single associated ops, and ops->func can be
+ * called directly, allow the call site to call via the ops.
+ */
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) &&
+ ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_get_func(ops) == ops->func)
+ rec->flags |= FTRACE_FL_CALL_OPS;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS;
+
count++;
/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
@@ -1859,6 +1930,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
* - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
* - If the hash is EMPTY_HASH, it hits nothing
* - Anything else hits the recs which match the hash entries.
+ *
+ * DIRECT ops does not have IPMODIFY flag, but we still need to check it
+ * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call
+ * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with
+ * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate
+ * the return value to the caller and eventually to the owner of the DIRECT
+ * ops.
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
@@ -1867,17 +1945,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
int in_old, in_new;
+ bool is_ipmodify, is_direct;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
+ is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
+
+ /* neither IPMODIFY nor DIRECT, skip */
+ if (!is_ipmodify && !is_direct)
+ return 0;
+
+ if (WARN_ON_ONCE(is_ipmodify && is_direct))
return 0;
/*
- * Since the IPMODIFY is a very address sensitive action, we do not
- * allow ftrace_ops to set all functions to new hash.
+ * Since the IPMODIFY and DIRECT are very address sensitive
+ * actions, we do not allow ftrace_ops to set all functions to new
+ * hash.
*/
if (!new_hash || !old_hash)
return -EINVAL;
@@ -1895,12 +1982,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
continue;
if (in_new) {
- /* New entries must ensure no others are using it */
- if (rec->flags & FTRACE_FL_IPMODIFY)
- goto rollback;
- rec->flags |= FTRACE_FL_IPMODIFY;
- } else /* Removed entry */
+ if (rec->flags & FTRACE_FL_IPMODIFY) {
+ int ret;
+
+ /* Cannot have two ipmodify on same rec */
+ if (is_ipmodify)
+ goto rollback;
+
+ FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+
+ /*
+ * Another ops with IPMODIFY is already
+ * attached. We are now attaching a direct
+ * ops. Run SHARE_IPMODIFY_SELF, to check
+ * whether sharing is supported.
+ */
+ if (!ops->ops_func)
+ return -EBUSY;
+ ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ if (ret)
+ return ret;
+ } else if (is_ipmodify) {
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ }
+ } else if (is_ipmodify) {
rec->flags &= ~FTRACE_FL_IPMODIFY;
+ }
} while_for_each_ftrace_rec();
return 0;
@@ -1969,12 +2076,15 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
- int i;
+ char ins[MCOUNT_INSN_SIZE];
- printk(KERN_CONT "%s", fmt);
+ if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
+ printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
+ return;
+ }
- for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+ printk(KERN_CONT "%s", fmt);
+ pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
}
enum ftrace_bug_type ftrace_bug_type;
@@ -2046,8 +2156,9 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
struct ftrace_ops *ops = NULL;
pr_info("ftrace record flags: %lx\n", rec->flags);
- pr_cont(" (%ld)%s", ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ pr_cont(" (%ld)%s%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -2074,7 +2185,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
ftrace_bug_type = FTRACE_BUG_UNKNOWN;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
return FTRACE_UPDATE_IGNORE;
/*
@@ -2115,6 +2226,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* want the direct enabled (it will be done via the
* direct helper). But if DIRECT_EN is set, and
* the count is not one, we need to clear it.
+ *
*/
if (ftrace_rec_count(rec) == 1) {
if (!(rec->flags & FTRACE_FL_DIRECT) !=
@@ -2123,6 +2235,19 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
} else if (rec->flags & FTRACE_FL_DIRECT_EN) {
flag |= FTRACE_FL_DIRECT;
}
+
+ /*
+ * Ops calls are special, as count matters.
+ * As with direct calls, they must only be enabled when count
+ * is one, otherwise they'll be handled via the list ops.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_CALL_OPS) !=
+ !(rec->flags & FTRACE_FL_CALL_OPS_EN))
+ flag |= FTRACE_FL_CALL_OPS;
+ } else if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ flag |= FTRACE_FL_CALL_OPS;
+ }
}
/* If the state of this record hasn't changed, then do nothing */
@@ -2134,7 +2259,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
flag ^= rec->flags & FTRACE_FL_ENABLED;
if (update) {
- rec->flags |= FTRACE_FL_ENABLED;
+ rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
if (flag & FTRACE_FL_REGS) {
if (rec->flags & FTRACE_FL_REGS)
rec->flags |= FTRACE_FL_REGS_EN;
@@ -2147,6 +2272,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+
+ /* Keep track of anything that modifies the function */
+ if (rec->flags & (FTRACE_FL_DIRECT | FTRACE_FL_IPMODIFY))
+ rec->flags |= FTRACE_FL_MODIFIED;
+
if (flag & FTRACE_FL_DIRECT) {
/*
* If there's only one user (direct_ops helper)
@@ -2166,6 +2296,21 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
rec->flags &= ~FTRACE_FL_DIRECT_EN;
}
}
+
+ if (flag & FTRACE_FL_CALL_OPS) {
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_CALL_OPS)
+ rec->flags |= FTRACE_FL_CALL_OPS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ } else {
+ /*
+ * Can only call directly if there's
+ * only one set of associated ops.
+ */
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ }
+ }
}
/*
@@ -2188,14 +2333,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags = 0;
+ rec->flags &= FTRACE_NOCLEAR_FLAGS;
else
/*
* Just disable the record, but keep the ops TRAMP
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN |
+ FTRACE_FL_CALL_OPS_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -2203,7 +2349,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
}
/**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
* @rec: the record to update
* @enable: set to true if the record is tracing, false to force disable
*
@@ -2216,7 +2362,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
}
/**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
* @rec: the record to test
* @enable: set to true to check if enabled, false if it is disabled
*
@@ -2248,6 +2394,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (op == op_exclude || !op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
struct ftrace_ops *op)
{
@@ -2350,9 +2514,28 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}
+struct ftrace_ops *
+ftrace_find_unique_ops(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op, *found = NULL;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (hash_contains_ip(ip, op->func_hash)) {
+ if (found)
+ return NULL;
+ found = op;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return found;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
-static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
int ftrace_direct_func_count;
@@ -2372,23 +2555,15 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
}
static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr;
+ unsigned long addr = READ_ONCE(ops->direct_call);
- addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
- arch_ftrace_set_direct_caller(regs, addr);
+ arch_ftrace_set_direct_caller(fregs, addr);
}
-
-struct ftrace_ops direct_ops = {
- .func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
- | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
- | FTRACE_OPS_FL_PERMANENT,
-};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -2397,7 +2572,7 @@ struct ftrace_ops direct_ops = {
*
* If the record has the FTRACE_FL_REGS set, that means that it
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
- * is not not set, then it wants to convert to the normal callback.
+ * is not set, then it wants to convert to the normal callback.
*
* Returns the address of the trampoline to set to
*/
@@ -2522,7 +2697,7 @@ void __weak ftrace_replace_code(int mod_flags)
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
failed = __ftrace_replace_code(rec, enable);
@@ -2542,7 +2717,7 @@ struct ftrace_rec_iter {
};
/**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
*
* Returns an iterator handle that is used to iterate over all
* the records that represent address locations where functions
@@ -2573,7 +2748,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
}
/**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
* Returns the next iterator after the given iterator @iter.
@@ -2598,7 +2773,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
}
/**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
* Returns the record that the current @iter is at.
@@ -2629,18 +2804,29 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
* archs can override this function if they must do something
* before the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_prepare(void)
+void __weak ftrace_arch_code_modify_prepare(void)
{
- return 0;
}
/*
* archs can override this function if they must do something
* after the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_post_process(void)
+void __weak ftrace_arch_code_modify_post_process(void)
{
- return 0;
+}
+
+static int update_ftrace_func(ftrace_func_t func)
+{
+ static ftrace_func_t save_func;
+
+ /* Avoid updating if it hasn't changed */
+ if (func == save_func)
+ return 0;
+
+ save_func = func;
+
+ return ftrace_update_ftrace_func(func);
}
void ftrace_modify_all_code(int command)
@@ -2663,7 +2849,7 @@ void ftrace_modify_all_code(int command)
* traced.
*/
if (update) {
- err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ err = update_ftrace_func(ftrace_ops_list_func);
if (FTRACE_WARN_ON(err))
return;
}
@@ -2679,7 +2865,7 @@ void ftrace_modify_all_code(int command)
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
- err = ftrace_update_ftrace_func(ftrace_trace_function);
+ err = update_ftrace_func(ftrace_trace_function);
if (FTRACE_WARN_ON(err))
return;
}
@@ -2701,7 +2887,7 @@ static int __ftrace_modify_code(void *data)
}
/**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
* @command: The command to tell ftrace what to do
*
* If an arch needs to fall back to the stop machine method, the
@@ -2713,7 +2899,7 @@ void ftrace_run_stop_machine(int command)
}
/**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
* @command: The command that needs to be done
*
* Archs can override this function if it does not need to
@@ -2726,12 +2912,7 @@ void __weak arch_ftrace_update_code(int command)
static void ftrace_run_update_code(int command)
{
- int ret;
-
- ret = ftrace_arch_code_modify_prepare();
- FTRACE_WARN_ON(ret);
- if (ret)
- return;
+ ftrace_arch_code_modify_prepare();
/*
* By default we use stop_machine() to modify the code.
@@ -2741,8 +2922,7 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- ret = ftrace_arch_code_modify_post_process();
- FTRACE_WARN_ON(ret);
+ ftrace_arch_code_modify_post_process();
}
static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
@@ -2764,6 +2944,51 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_del_rcu(&ops->list);
+ synchronize_rcu();
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+ ops->trampoline) {
+ /*
+ * Record the text poke event before the ksymbol unregister
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline,
+ (void *)ops->trampoline,
+ ops->trampoline_size, NULL, 0);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size,
+ true, FTRACE_TRAMPOLINE_SYM);
+ /* Remove from kallsyms after the perf events */
+ ftrace_remove_trampoline_from_kallsyms(ops);
+ }
+
+ arch_ftrace_trampoline_free(ops);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2813,6 +3038,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
__unregister_ftrace_function(ops);
ftrace_start_up--;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ ftrace_trampoline_free(ops);
return ret;
}
@@ -2821,6 +3048,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_startup_enable(command);
+ /*
+ * If ftrace is in an undefined state, we just remove ops from list
+ * to prevent the NULL pointer, instead of totally rolling it back and
+ * free trampoline, because those actions could cause further damage.
+ */
+ if (unlikely(ftrace_disabled)) {
+ __unregister_ftrace_function(ops);
+ return -ENODEV;
+ }
+
ops->flags &= ~FTRACE_OPS_FL_ADDING;
return 0;
@@ -2858,18 +3095,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled) {
- /*
- * If these are dynamic or per_cpu ops, they still
- * need their data freed. Since, function tracing is
- * not currently active, we can just free them
- * without synchronizing all CPUs.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- goto free_ops;
-
- return 0;
- }
+ if (!command || !ftrace_enabled)
+ goto out;
/*
* If the ops uses a trampoline, then it needs to be
@@ -2894,7 +3121,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
@@ -2906,11 +3133,10 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
removed_ops = NULL;
ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+out:
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the per_cpu
- * ops.
*/
if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
@@ -2924,56 +3150,21 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
synchronize_rcu_tasks_rude();
/*
- * When the kernel is preeptive, tasks can be preempted
+ * When the kernel is preemptive, tasks can be preempted
* while on a ftrace trampoline. Just scheduling a task on
* a CPU is not good enough to flush them. Calling
- * synchornize_rcu_tasks() will wait for those tasks to
+ * synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
- free_ops:
- arch_ftrace_trampoline_free(ops);
+ ftrace_trampoline_free(ops);
}
return 0;
}
-static void ftrace_startup_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* Force update next time */
- saved_ftrace_func = NULL;
- /* ftrace_start_up is true if we want ftrace running */
- if (ftrace_start_up) {
- command = FTRACE_UPDATE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_START_FUNC_RET;
- ftrace_startup_enable(command);
- }
-}
-
-static void ftrace_shutdown_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* ftrace_start_up is true if ftrace is running */
- if (ftrace_start_up) {
- command = FTRACE_DISABLE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_STOP_FUNC_RET;
- ftrace_run_update_code(command);
- }
-}
-
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
@@ -2989,38 +3180,9 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
ftrace_hash_empty(ops->func_hash->notrace_hash);
}
-/*
- * Check if the current ops references the record.
- *
- * If the ops traces all functions, then it was already accounted for.
- * If the ops does not trace the current record function, skip it.
- * If the ops ignores the function via notrace filter, skip it.
- */
-static inline bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
-{
- /* If ops isn't enabled, ignore it */
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return false;
-
- /* If ops traces all then it includes this function */
- if (ops_traces_mod(ops))
- return true;
-
- /* The function must be in the filter */
- if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
- return false;
-
- /* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
- return false;
-
- return true;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
+ bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
u64 start, stop;
@@ -3059,8 +3221,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_nop_initialize(mod, p))
+ if (init_nop && !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
@@ -3077,19 +3238,15 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
static int ftrace_allocate_records(struct ftrace_page *pg, int count)
{
int order;
+ int pages;
int cnt;
if (WARN_ON(!count))
return -EINVAL;
- order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
-
- /*
- * We want to fill as much as possible. No more than a page
- * may be empty.
- */
- while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
- order--;
+ /* We want to fill as much as possible, with no empty pages */
+ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+ order = fls(pages) - 1;
again:
pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
@@ -3098,7 +3255,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
/* if we can't allocate this size, try something smaller */
if (!order)
return -ENOMEM;
- order >>= 1;
+ order--;
goto again;
}
@@ -3106,7 +3263,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
ftrace_number_of_groups++;
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
- pg->size = cnt;
+ pg->order = order;
if (cnt > count)
cnt = count;
@@ -3114,12 +3271,27 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
return cnt;
}
+static void ftrace_free_pages(struct ftrace_page *pages)
+{
+ struct ftrace_page *pg = pages;
+
+ while (pg) {
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
+ pages = pg->next;
+ kfree(pg);
+ pg = pages;
+ ftrace_number_of_groups--;
+ }
+}
+
static struct ftrace_page *
ftrace_allocate_pages(unsigned long num_to_init)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
- int order;
int cnt;
if (!num_to_init)
@@ -3153,16 +3325,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- pg = start_pg;
- while (pg) {
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- start_pg = pg->next;
- kfree(pg);
- pg = start_pg;
- ftrace_number_of_pages -= 1 << order;
- ftrace_number_of_groups--;
- }
+ ftrace_free_pages(start_pg);
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
}
@@ -3415,7 +3578,10 @@ t_func_next(struct seq_file *m, loff_t *pos)
!ftrace_lookup_ip(iter->hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
- !(rec->flags & FTRACE_FL_ENABLED))) {
+ !(rec->flags & FTRACE_FL_ENABLED)) ||
+
+ ((iter->flags & FTRACE_ITER_TOUCHED) &&
+ !(rec->flags & FTRACE_FL_TOUCHED))) {
rec = NULL;
goto retry;
@@ -3544,6 +3710,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
seq_printf(m, " ->%pS", ptr);
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+/*
+ * Weak functions can still have an mcount/fentry that is saved in
+ * the __mcount_loc section. These can be detected by having a
+ * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the
+ * symbol found by kallsyms is not the function that the mcount/fentry
+ * is part of. The offset is much greater in these cases.
+ *
+ * Test the record to make sure that the ip points to a valid kallsyms
+ * and if not, mark it disabled.
+ */
+static int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ char str[KSYM_SYMBOL_LEN];
+ unsigned long offset;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str);
+
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ rec->flags |= FTRACE_FL_DISABLED;
+ return 0;
+ }
+ return 1;
+}
+
+static struct workqueue_struct *ftrace_check_wq __initdata;
+static struct work_struct ftrace_check_work __initdata;
+
+/*
+ * Scan all the mcount/fentry entries to make sure they are valid.
+ */
+static __init void ftrace_check_work_func(struct work_struct *work)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ test_for_valid_rec(rec);
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static int __init ftrace_check_for_weak_functions(void)
+{
+ INIT_WORK(&ftrace_check_work, ftrace_check_work_func);
+
+ ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0);
+
+ queue_work(ftrace_check_wq, &ftrace_check_work);
+ return 0;
+}
+
+static int __init ftrace_check_sync(void)
+{
+ /* Make sure the ftrace_check updates are finished */
+ if (ftrace_check_wq)
+ destroy_workqueue(ftrace_check_wq);
+ return 0;
+}
+
+late_initcall_sync(ftrace_check_sync);
+subsys_initcall(ftrace_check_for_weak_functions);
+
+static int print_rec(struct seq_file *m, unsigned long ip)
+{
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(ip, NULL, &offset, &modname, str);
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld",
+ FTRACE_INVALID_FUNCTION, offset);
+ ret = NULL;
+ }
+
+ seq_puts(m, str);
+ if (modname)
+ seq_printf(m, " [%s]", modname);
+ return ret == NULL ? -1 : 0;
+}
+#else
+static inline int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ return 1;
+}
+
+static inline int print_rec(struct seq_file *m, unsigned long ip)
+{
+ seq_printf(m, "%ps", (void *)ip);
+ return 0;
+}
+#endif
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -3568,15 +3833,26 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
- seq_printf(m, "%ps", (void *)rec->ip);
- if (iter->flags & FTRACE_ITER_ENABLED) {
+ if (iter->flags & FTRACE_ITER_ADDRS)
+ seq_printf(m, "%lx ", rec->ip);
+
+ if (print_rec(m, rec->ip)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
+ seq_putc(m, '\n');
+ return 0;
+ }
+
+ if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
struct ftrace_ops *ops;
- seq_printf(m, " (%ld)%s%s%s",
+ seq_printf(m, " (%ld)%s%s%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
- rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
+ rec->flags & FTRACE_FL_DIRECT ? " D" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ",
+ rec->flags & FTRACE_FL_MODIFIED ? " M " : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -3592,6 +3868,15 @@ static int t_show(struct seq_file *m, void *v)
} else {
add_trampoline_func(m, NULL, rec);
}
+ if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ ops = ftrace_find_unique_ops(rec);
+ if (ops) {
+ seq_printf(m, "\tops: %pS (%pS)",
+ ops, ops->func);
+ } else {
+ seq_puts(m, "\tops: ERROR!");
+ }
+ }
if (rec->flags & FTRACE_FL_DIRECT) {
unsigned long direct;
@@ -3661,6 +3946,55 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return 0;
}
+static int
+ftrace_touched_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+
+ /*
+ * This shows us what functions have ever been enabled
+ * (traced, direct, patched, etc). Not sure if we want lockdown
+ * to hide such critical information for an admin.
+ * Although, perhaps it can show information we don't
+ * want people to see, but if something had traced
+ * something, we probably want to know about it.
+ */
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_TOUCHED;
+ iter->ops = &global_ops;
+
+ return 0;
+}
+
+static int
+ftrace_avail_addrs_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ADDRS;
+ iter->ops = &global_ops;
+
+ return 0;
+}
+
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -3855,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
/* Do nothing if it exists */
if (entry)
return 0;
-
- ret = add_hash_entry(hash, rec->ip);
+ if (add_hash_entry(hash, rec->ip) == NULL)
+ ret = -ENOMEM;
}
return ret;
}
@@ -3886,6 +4220,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
return 0;
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ unsigned long offset;
+
+ kallsyms_lookup(ip, NULL, &offset, modname, str);
+ if (offset > FTRACE_MCOUNT_MAX_OFFSET)
+ return -1;
+ return 0;
+}
+#else
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ kallsyms_lookup(ip, NULL, NULL, modname, str);
+ return 0;
+}
+#endif
+
static int
ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
struct ftrace_glob *mod_g, int exclude_mod)
@@ -3893,7 +4245,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
char str[KSYM_SYMBOL_LEN];
char *modname;
- kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+ if (lookup_ip(rec->ip, &modname, str)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING &&
+ !(rec->flags & FTRACE_FL_DISABLED));
+ return 0;
+ }
if (mod_g) {
int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
@@ -3972,6 +4329,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
}
found = 1;
}
+ cond_resched();
} while_for_each_ftrace_rec();
out_unlock:
mutex_unlock(&ftrace_lock);
@@ -4109,7 +4467,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
struct ftrace_hash **orig_hash, *new_hash;
LIST_HEAD(process_mods);
char *func;
- int ret;
mutex_lock(&ops->func_hash->regex_lock);
@@ -4138,8 +4495,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
if (!func) /* warn? */
continue;
- list_del(&ftrace_mod->list);
- list_add(&ftrace_mod->list, &process_mods);
+ list_move(&ftrace_mod->list, &process_mods);
/* Use the newly allocated func, as it may be "*" */
kfree(ftrace_mod->func);
@@ -4162,7 +4518,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ ftrace_hash_move_and_update_ops(ops, orig_hash,
new_hash, enable);
mutex_unlock(&ftrace_lock);
@@ -4240,7 +4596,7 @@ static int __init ftrace_mod_cmd_init(void)
core_initcall(ftrace_mod_cmd_init);
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct ftrace_probe_ops *probe_ops;
struct ftrace_func_probe *probe;
@@ -4316,7 +4672,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to map @data to
* @data: The data to map to @ip
*
- * Returns 0 on succes otherwise an error.
+ * Returns 0 on success otherwise an error.
*/
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
unsigned long ip, void *data)
@@ -4346,7 +4702,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to remove the data from
*
* Returns the data if it is found, otherwise NULL.
- * Note, if the data pointer is used as the data itself, (see
+ * Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
*/
@@ -4441,8 +4797,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops,
void *data)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
@@ -4461,11 +4817,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes) {
+ if (!probe) {
probe = kzalloc(sizeof(*probe), GFP_KERNEL);
if (!probe) {
mutex_unlock(&ftrace_lock);
@@ -4484,7 +4842,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
/*
* Note, there's a small window here that the func_hash->filter_hash
- * may be NULL or empty. Need to be carefule when reading the loop.
+ * may be NULL or empty. Need to be careful when reading the loop.
*/
mutex_lock(&probe->ops.func_hash->regex_lock);
@@ -4583,9 +4941,9 @@ int
unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_glob func_g;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
@@ -4613,11 +4971,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes)
+ if (!probe)
goto err_unlock_ftrace;
ret = -EINVAL;
@@ -4856,11 +5216,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
- if (!ftrace_location(ip))
+ ip = ftrace_location(ip);
+ if (!ip)
return -EINVAL;
if (remove) {
@@ -4871,12 +5232,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return 0;
}
- return add_hash_entry(hash, ip);
+ entry = add_hash_entry(hash, ip);
+ return entry ? 0 : -ENOMEM;
+}
+
+static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err) {
+ /*
+ * This expects the @hash is a temporary hash and if this
+ * fails the caller must free the @hash.
+ */
+ return err;
+ }
+ }
+ return 0;
}
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -4906,8 +5289,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -4924,10 +5307,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -4940,353 +5323,302 @@ struct ftrace_direct_func {
static LIST_HEAD(ftrace_direct_funcs);
-/**
- * ftrace_find_direct_func - test an address if it is a registered direct caller
- * @addr: The address of a registered direct caller
- *
- * This searches to see if a ftrace direct caller has been registered
- * at a specific address, and if so, it returns a descriptor for it.
- *
- * This can be used by architecture code to see if an address is
- * a direct caller (trampoline) attached to a fentry/mcount location.
- * This is useful for the function_graph tracer, as it may need to
- * do adjustments if it traced a location that also has a direct
- * trampoline attached to it.
+static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+
+/*
+ * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct
+ * call will be jumped from ftrace_regs_caller. Only if the architecture does
+ * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it
+ * jumps from ftrace_caller for multiple ftrace_ops.
*/
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
+#else
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#endif
+
+static int check_direct_multi(struct ftrace_ops *ops)
{
- struct ftrace_direct_func *entry;
- bool found = false;
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS)
+ return -EINVAL;
+ return 0;
+}
- /* May be called by fgraph trampoline (protected by rcu tasks) */
- list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
- if (entry->addr == addr) {
- found = true;
- break;
+static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr)
+{
+ struct ftrace_func_entry *entry, *del;
+ int size, i;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (del && del->direct == addr) {
+ remove_hash_entry(direct_functions, del);
+ kfree(del);
+ }
}
}
- if (found)
- return entry;
-
- return NULL;
}
/**
* register_ftrace_direct - Call a custom trampoline directly
- * @ip: The address of the nop at the beginning of a function
- * @addr: The address of the trampoline to call at @ip
+ * for multiple functions registered in @ops
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the trampoline to call at @ops functions
+ *
+ * This is used to connect a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
*
- * This is used to connect a direct call from the nop location (@ip)
- * at the start of ftrace traced functions. The location that it calls
- * (@addr) must be able to handle a direct call, and save the parameters
- * of the function being traced, and restore them (or inject new ones
- * if needed), before returning.
+ * The location that it calls (@addr) must be able to handle a direct call,
+ * and save the parameters of the function being traced, and restore them
+ * (or inject new ones if needed), before returning.
*
* Returns:
* 0 on success
- * -EBUSY - Another direct function is already attached (there can be only one)
- * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- * -ENOMEM - There was an allocation failure.
+ * -EINVAL - The @ops object was already registered with this call or
+ * when there are no functions in @ops object.
+ * -EBUSY - Another direct function is already attached (there can be only one)
+ * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ * -ENOMEM - There was an allocation failure.
*/
-int register_ftrace_direct(unsigned long ip, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *free_hash = NULL;
- struct dyn_ftrace *rec;
- int ret = -EBUSY;
-
- mutex_lock(&direct_mutex);
+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
+ struct ftrace_func_entry *entry, *new;
+ int err = -EBUSY, size, i;
- /* See if there's a direct function at @ip already */
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
+ if (ops->func || ops->trampoline)
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ return -EINVAL;
- ret = -ENODEV;
- rec = lookup_rec(ip, ip);
- if (!rec)
- goto out_unlock;
+ hash = ops->func_hash->filter_hash;
+ if (ftrace_hash_empty(hash))
+ return -EINVAL;
- /*
- * Check if the rec says it has a direct call but we didn't
- * find one earlier?
- */
- if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
- goto out_unlock;
+ mutex_lock(&direct_mutex);
- /* Make sure the ip points to the exact record */
- if (ip != rec->ip) {
- ip = rec->ip;
- /* Need to check this ip for a direct. */
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
+ /* Make sure requested entries are not already registered.. */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (ftrace_find_rec_direct(entry->ip))
+ goto out_unlock;
+ }
}
- ret = -ENOMEM;
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
+ err = -ENOMEM;
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- goto out_unlock;
+ /* Make a copy hash to place the new and the old entries in */
+ size = hash->count + direct_functions->count;
+ if (size > 32)
+ size = 32;
+ new_hash = alloc_ftrace_hash(fls(size));
+ if (!new_hash)
+ goto out_unlock;
- free_hash = direct_functions;
- direct_functions = new_hash;
+ /* Now copy over the existing direct entries */
+ size = 1 << direct_functions->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
+ new = add_hash_entry(new_hash, entry->ip);
+ if (!new)
+ goto out_unlock;
+ new->direct = entry->direct;
+ }
}
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto out_unlock;
-
- direct = ftrace_find_direct_func(addr);
- if (!direct) {
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
- if (!direct) {
- kfree(entry);
- goto out_unlock;
+ /* ... and add the new entries */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ new = add_hash_entry(new_hash, entry->ip);
+ if (!new)
+ goto out_unlock;
+ /* Update both the copy and the hash entry */
+ new->direct = addr;
+ entry->direct = addr;
}
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
}
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+ new_hash = NULL;
- ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
- if (ret)
- remove_hash_entry(direct_functions, entry);
+ ops->func = call_direct_funcs;
+ ops->flags = MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+ ops->direct_call = addr;
- if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function(&direct_ops);
- if (ret)
- ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
- }
+ err = register_ftrace_function_nolock(ops);
- if (ret) {
- kfree(entry);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- if (free_hash)
- free_ftrace_hash(free_hash);
- free_hash = NULL;
- ftrace_direct_func_count--;
- }
- } else {
- direct->count++;
- }
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash) {
+ if (free_hash && free_hash != EMPTY_HASH) {
synchronize_rcu_tasks();
free_ftrace_hash(free_hash);
}
- return ret;
+ if (new_hash)
+ free_ftrace_hash(new_hash);
+
+ return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
-static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
- struct dyn_ftrace **recp)
+/**
+ * unregister_ftrace_direct - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct for @ops object.
+ * @ops: The address of the struct ftrace_ops object
+ *
+ * This is used to remove a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+ bool free_filters)
{
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
-
- rec = lookup_rec(*ip, *ip);
- if (!rec)
- return NULL;
-
- entry = __ftrace_lookup_ip(direct_functions, rec->ip);
- if (!entry) {
- WARN_ON(rec->flags & FTRACE_FL_DIRECT);
- return NULL;
- }
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+ int err;
- WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
- /* Passed in ip just needs to be on the call site */
- *ip = rec->ip;
+ mutex_lock(&direct_mutex);
+ err = unregister_ftrace_function(ops);
+ remove_direct_functions_hash(hash, addr);
+ mutex_unlock(&direct_mutex);
- if (recp)
- *recp = rec;
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
- return entry;
+ if (free_filters)
+ ftrace_free_filter(ops);
+ return err;
}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+static int
+__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- entry = find_direct_entry(&ip, NULL);
- if (!entry)
- goto out_unlock;
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry, *iter;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ int i, size;
+ int err;
- if (direct_functions->count == 1)
- unregister_ftrace_function(&direct_ops);
+ lockdep_assert_held_once(&direct_mutex);
- ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+ tmp_ops.direct_call = addr;
- WARN_ON(ret);
+ err = register_ftrace_function_nolock(&tmp_ops);
+ if (err)
+ return err;
- remove_hash_entry(direct_functions, entry);
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
- direct = ftrace_find_direct_func(addr);
- if (!WARN_ON(!direct)) {
- /* This is the good path (see the ! before WARN) */
- direct->count--;
- WARN_ON(direct->count < 0);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- kfree(entry);
- ftrace_direct_func_count--;
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
+ entry = __ftrace_lookup_ip(direct_functions, iter->ip);
+ if (!entry)
+ continue;
+ entry->direct = addr;
}
}
- out_unlock:
- mutex_unlock(&direct_mutex);
+ /* Prevent store tearing if a trampoline concurrently accesses the value */
+ WRITE_ONCE(ops->direct_call, addr);
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
+ mutex_unlock(&ftrace_lock);
-static struct ftrace_ops stub_ops = {
- .func = ftrace_stub,
-};
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+ return err;
+}
/**
- * ftrace_modify_direct_caller - modify ftrace nop directly
- * @entry: The ftrace hash entry of the direct helper for @rec
- * @rec: The record representing the function site to patch
- * @old_addr: The location that the site at @rec->ip currently calls
- * @new_addr: The location that the site at @rec->ip should call
+ * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
*
- * An architecture may overwrite this function to optimize the
- * changing of the direct callback on an ftrace nop location.
- * This is called with the ftrace_lock mutex held, and no other
- * ftrace callbacks are on the associated record (@rec). Thus,
- * it is safe to modify the ftrace record, where it should be
- * currently calling @old_addr directly, to call @new_addr.
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
*
- * Safety checks should be made to make sure that the code at
- * @rec->ip is currently calling @old_addr. And this must
- * also update entry->direct to @new_addr.
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Caller should already have direct_mutex locked, so we don't lock
+ * direct_mutex here.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
*/
-int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
- struct dyn_ftrace *rec,
- unsigned long old_addr,
- unsigned long new_addr)
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
{
- unsigned long ip = rec->ip;
- int ret;
-
- /*
- * The ftrace_lock was used to determine if the record
- * had more than one registered user to it. If it did,
- * we needed to prevent that from changing to do the quick
- * switch. But if it did not (only a direct caller was attached)
- * then this function is called. But this function can deal
- * with attached callers to the rec that we care about, and
- * since this function uses standard ftrace calls that take
- * the ftrace_lock mutex, we need to release it.
- */
- mutex_unlock(&ftrace_lock);
-
- /*
- * By setting a stub function at the same address, we force
- * the code to call the iterator and the direct_ops helper.
- * This means that @ip does not call the direct call, and
- * we can simply modify it.
- */
- ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
- if (ret)
- goto out_lock;
-
- ret = register_ftrace_function(&stub_ops);
- if (ret) {
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
- goto out_lock;
- }
-
- entry->direct = new_addr;
-
- /*
- * By removing the stub, we put back the direct call, calling
- * the @new_addr.
- */
- unregister_ftrace_function(&stub_ops);
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-
- out_lock:
- mutex_lock(&ftrace_lock);
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
- return ret;
+ return __modify_ftrace_direct(ops, addr);
}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
/**
- * modify_ftrace_direct - Modify an existing direct call to call something else
- * @ip: The instruction pointer to modify
- * @old_addr: The address that the current @ip calls directly
- * @new_addr: The address that the @ip should call
+ * modify_ftrace_direct - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
*
- * This modifies a ftrace direct caller at an instruction pointer without
- * having to disable it first. The direct call will switch over to the
- * @new_addr without missing anything.
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
*
* Returns: zero on success. Non zero on error, which includes:
- * -ENODEV : the @ip given has no direct caller attached
- * -EINVAL : the @old_addr does not match the current direct caller
+ * -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct(unsigned long ip,
- unsigned long old_addr, unsigned long new_addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- mutex_lock(&ftrace_lock);
- entry = find_direct_entry(&ip, &rec);
- if (!entry)
- goto out_unlock;
-
- ret = -EINVAL;
- if (entry->direct != old_addr)
- goto out_unlock;
+ int err;
- /*
- * If there's no other ftrace callback on the rec->ip location,
- * then it can be changed directly by the architecture.
- * If there is another caller, then we just need to change the
- * direct caller helper to point to @new_addr.
- */
- if (ftrace_rec_count(rec) == 1) {
- ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
- } else {
- entry->direct = new_addr;
- ret = 0;
- }
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
- out_unlock:
- mutex_unlock(&ftrace_lock);
+ mutex_lock(&direct_mutex);
+ err = __modify_ftrace_direct(ops, addr);
mutex_unlock(&direct_mutex);
- return ret;
+ return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
@@ -5299,17 +5631,44 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
* @reset - non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
- * If @ip is NULL, it failes to update filter.
+ * If @ip is NULL, it fails to update filter.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
+*/
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
+/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
*
@@ -5330,7 +5689,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
@@ -5342,6 +5701,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -5361,6 +5724,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -5413,7 +5780,7 @@ bool ftrace_filter_param __initdata;
static int __init set_ftrace_notrace(char *str)
{
ftrace_filter_param = true;
- strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
@@ -5421,7 +5788,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
ftrace_filter_param = true;
- strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -5433,14 +5800,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static int __init set_graph_function(char *str)
{
- strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_filter=", set_graph_function);
static int __init set_graph_notrace_function(char *str)
{
- strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
@@ -5514,7 +5881,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
struct ftrace_hash **orig_hash;
struct trace_parser *parser;
int filter_hash;
- int ret;
if (file->f_mode & FMODE_READ) {
iter = m->private;
@@ -5524,7 +5890,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
- ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+ ftrace_process_regex(iter, parser->buffer,
+ parser->idx, enable);
}
trace_parser_put(parser);
@@ -5536,13 +5905,17 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- if (iter->tr && !list_empty(&iter->tr->mod_trace))
- iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ if (iter->tr) {
+ if (list_empty(&iter->tr->mod_trace))
+ iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+ else
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ }
} else
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
+ ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
} else {
@@ -5573,6 +5946,20 @@ static const struct file_operations ftrace_enabled_fops = {
.release = seq_release_private,
};
+static const struct file_operations ftrace_touched_fops = {
+ .open = ftrace_touched_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct file_operations ftrace_avail_addrs_fops = {
+ .open = ftrace_avail_addrs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
@@ -5877,7 +6264,8 @@ ftrace_graph_release(struct inode *inode, struct file *file)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- synchronize_rcu_tasks_rude();
+ if (old_hash != EMPTY_HASH)
+ synchronize_rcu_tasks_rude();
free_ftrace_hash(old_hash);
}
@@ -5925,7 +6313,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
- if (add_hash_entry(hash, rec->ip) < 0)
+ if (add_hash_entry(hash, rec->ip) == NULL)
goto out;
} else {
if (entry) {
@@ -6000,10 +6388,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent)
{
- trace_create_file("set_ftrace_filter", 0644, parent,
+ trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent,
ops, &ftrace_filter_fops);
- trace_create_file("set_ftrace_notrace", 0644, parent,
+ trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent,
ops, &ftrace_notrace_fops);
}
@@ -6030,19 +6418,25 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
- trace_create_file("available_filter_functions", 0444,
+ trace_create_file("available_filter_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_avail_fops);
- trace_create_file("enabled_functions", 0444,
+ trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_avail_addrs_fops);
+
+ trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
+ trace_create_file("touched_functions", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_touched_fops);
+
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- trace_create_file("set_graph_function", 0644, d_tracer,
+ trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0644, d_tracer,
+ trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -6062,13 +6456,36 @@ static int ftrace_cmp_ips(const void *a, const void *b)
return 0;
}
+#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST
+static void test_is_sorted(unsigned long *start, unsigned long count)
+{
+ int i;
+
+ for (i = 1; i < count; i++) {
+ if (WARN(start[i - 1] > start[i],
+ "[%d] %pS at %lx is not sorted with %pS at %lx\n", i,
+ (void *)start[i - 1], start[i - 1],
+ (void *)start[i], start[i]))
+ break;
+ }
+ if (i == count)
+ pr_info("ftrace section at %px sorted properly\n", start);
+}
+#else
+static void test_is_sorted(unsigned long *start, unsigned long count)
+{
+}
+#endif
+
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg_unuse = NULL;
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ unsigned long skipped = 0;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -6080,8 +6497,17 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
- sort(start, count, sizeof(*start),
- ftrace_cmp_ips, NULL);
+ /*
+ * Sorting mcount in vmlinux at build time depend on
+ * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in
+ * modules can not be sorted at build time.
+ */
+ if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) {
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, NULL);
+ } else {
+ test_is_sorted(start, count);
+ }
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
@@ -6114,6 +6540,7 @@ static int ftrace_process_locs(struct module *mod,
p = start;
pg = start_pg;
while (p < end) {
+ unsigned long end_offset;
addr = ftrace_call_adjust(*p++);
/*
* Some architecture linkers will pad between
@@ -6121,10 +6548,13 @@ static int ftrace_process_locs(struct module *mod,
* object files to satisfy alignments.
* Skip any NULL pointers.
*/
- if (!addr)
+ if (!addr) {
+ skipped++;
continue;
+ }
- if (pg->index == pg->size) {
+ end_offset = (pg->index+1) * sizeof(pg->records[0]);
+ if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
@@ -6135,8 +6565,10 @@ static int ftrace_process_locs(struct module *mod,
rec->ip = addr;
}
- /* We should have used all pages */
- WARN_ON(pg->next);
+ if (pg->next) {
+ pg_unuse = pg->next;
+ pg->next = NULL;
+ }
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
@@ -6158,6 +6590,11 @@ static int ftrace_process_locs(struct module *mod,
out:
mutex_unlock(&ftrace_lock);
+ /* We should have used all pages unless we skipped some */
+ if (pg_unuse) {
+ WARN_ON(!skipped);
+ ftrace_free_pages(pg_unuse);
+ }
return ret;
}
@@ -6178,6 +6615,59 @@ struct ftrace_mod_map {
unsigned int num_funcs;
};
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+ unsigned long *value, char *type,
+ char *name, char *module_name,
+ int *exported)
+{
+ struct ftrace_ops *op;
+
+ list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+ if (!op->trampoline || symnum--)
+ continue;
+ *value = op->trampoline;
+ *type = 't';
+ strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ *exported = 0;
+ return 0;
+ }
+
+ return -ERANGE;
+}
+
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES)
+/*
+ * Check if the current ops references the given ip.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static bool
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return false;
+
+ /* If ops traces all then it includes this function */
+ if (ops_traces_mod(ops))
+ return true;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
+ return false;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
+ return false;
+
+ return true;
+}
+#endif
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6190,8 +6680,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_ip(ops, rec->ip)) {
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ continue;
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ if (cnt == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
+ }
}
return cnt;
@@ -6220,7 +6721,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
}
}
-/* Clear any records from hashs */
+/* Clear any records from hashes */
static void clear_mod_from_hashes(struct ftrace_page *pg)
{
struct trace_array *tr;
@@ -6261,7 +6762,6 @@ void ftrace_release_mod(struct module *mod)
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
- int order;
mutex_lock(&ftrace_lock);
@@ -6283,8 +6783,7 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod) ||
- within_module_init(rec->ip, mod)) {
+ if (within_module(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -6312,11 +6811,12 @@ void ftrace_release_mod(struct module *mod)
/* Needs to be called outside of ftrace_lock */
clear_mod_from_hashes(pg);
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
tmp_page = pg->next;
kfree(pg);
- ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
}
}
@@ -6355,10 +6855,16 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod) &&
- !within_module_init(rec->ip, mod))
+ if (!within_module(rec->ip, mod))
break;
+ /* Weak functions should still be ignored */
+ if (!test_for_valid_rec(rec)) {
+ /* Clear all other flags. Should not be enabled anyway */
+ rec->flags = FTRACE_FL_DISABLED;
+ continue;
+ }
+
cnt = 0;
/*
@@ -6370,8 +6876,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -6395,11 +6901,16 @@ void ftrace_module_enable(struct module *mod)
void ftrace_module_init(struct module *mod)
{
+ int ret;
+
if (ftrace_disabled || !mod->num_ftrace_callsites)
return;
- ftrace_process_locs(mod, mod->ftrace_callsites,
- mod->ftrace_callsites + mod->num_ftrace_callsites);
+ ret = ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
+ if (ret)
+ pr_warn("ftrace: failed to allocate entries for module '%s' functions\n",
+ mod->name);
}
static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
@@ -6478,7 +6989,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
if (off)
*off = addr - found_func->ip;
if (sym)
- strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+ strscpy(sym, found_func->name, KSYM_NAME_LEN);
return found_func->name;
}
@@ -6514,6 +7025,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
{
struct ftrace_mod_map *mod_map;
struct ftrace_mod_func *mod_func;
+ int ret;
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6531,8 +7043,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
*value = mod_func->ip;
*type = 'T';
- strlcpy(name, mod_func->name, KSYM_NAME_LEN);
- strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+ strscpy(name, mod_func->name, KSYM_NAME_LEN);
+ strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
*exported = 1;
preempt_enable();
return 0;
@@ -6540,8 +7052,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
WARN_ON(1);
break;
}
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
preempt_enable();
- return -ERANGE;
+ return ret;
}
#else
@@ -6553,6 +7067,18 @@ allocate_ftrace_mod_map(struct module *mod,
{
return NULL;
}
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, char *module_name,
+ int *exported)
+{
+ int ret;
+
+ preempt_disable();
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
+ preempt_enable();
+ return ret;
+}
#endif /* CONFIG_MODULES */
struct ftrace_init_func {
@@ -6618,10 +7144,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct dyn_ftrace key;
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
- struct list_head clear_hash;
- int order;
-
- INIT_LIST_HEAD(&clear_hash);
+ LIST_HEAD(clear_hash);
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
@@ -6657,9 +7180,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- ftrace_number_of_pages -= 1 << order;
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
@@ -6685,9 +7209,16 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
+int __init __weak ftrace_dyn_arch_init(void)
+{
+ return 0;
+}
+
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -6708,17 +7239,21 @@ void __init ftrace_init(void)
}
pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, count / ENTRIES_PER_PAGE + 1);
-
- last_ftrace_enabled = ftrace_enabled = 1;
+ count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
+ if (ret) {
+ pr_warn("ftrace: failed to allocate entries for functions\n");
+ goto failed;
+ }
pr_info("ftrace: allocated %ld pages with %ld groups\n",
ftrace_number_of_pages, ftrace_number_of_groups);
+ last_ftrace_enabled = ftrace_enabled = 1;
+
set_ftrace_early_filters();
return;
@@ -6733,7 +7268,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
+ unsigned long trampoline = ops->trampoline;
+
arch_ftrace_update_trampoline(ops);
+ if (ops->trampoline && ops->trampoline != trampoline &&
+ (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+ /* Add to kallsyms before the perf events */
+ ftrace_add_trampoline_to_kallsyms(ops);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size, false,
+ FTRACE_TRAMPOLINE_SYM);
+ /*
+ * Record the perf text poke event after the ksymbol register
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+ (void *)ops->trampoline,
+ ops->trampoline_size);
+ }
}
void ftrace_init_trace_array(struct trace_array *tr)
@@ -6746,8 +7298,7 @@ void ftrace_init_trace_array(struct trace_array *tr)
struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
@@ -6759,12 +7310,8 @@ static int __init ftrace_nodyn_init(void)
core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
-static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
-# define ftrace_startup_sysctl() do { } while (0)
-# define ftrace_shutdown_sysctl() do { } while (0)
-
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
}
@@ -6797,20 +7344,20 @@ void ftrace_reset_array_ops(struct trace_array *tr)
static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ignored, struct pt_regs *regs)
+ struct ftrace_ops *ignored, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
- if (bit < 0)
- return;
-
/*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_rcu().
+ * The ftrace_test_and_set_recursion() will disable preemption,
+ * which is required since some of the ops may be dynamically
+ * allocated, they must be freed after a synchronize_rcu().
*/
- preempt_disable_notrace();
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
+ if (bit < 0)
+ return;
do_for_each_ftrace_op(op, ftrace_ops_list) {
/* Stub functions don't need to be called nor tested */
@@ -6819,8 +7366,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
/*
* Check the following for each ops before calling their func:
* if RCU flag is set, then rcu_is_watching() must be true
- * if PER_CPU is set, then ftrace_function_local_disable()
- * must be false
* Otherwise test if the ip matches the ops filter
*
* If any of the above fails then the op->func() is not executed.
@@ -6831,11 +7376,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
pr_warn("op=%p %pS\n", op, op);
goto out;
}
- op->func(ip, parent_ip, op, regs);
+ op->func(ip, parent_ip, op, fregs);
}
} while_for_each_ftrace_op(op);
out:
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -6851,44 +7395,41 @@ out:
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORTS_FTRACE_OPS.
+ *
+ * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be
+ * arch_ftrace_ops_list_func.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+ __ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
-NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
-NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
+NOKPROBE_SYMBOL(arch_ftrace_ops_list_func);
/*
* If there's only one function registered but it does not support
- * recursion, needs RCU protection and/or requires per cpu handling, then
- * this function will be called by the mcount trampoline.
+ * recursion, needs RCU protection, then this function will be called
+ * by the mcount trampoline.
*/
static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
int bit;
- if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
- return;
-
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
- preempt_disable_notrace();
-
- op->func(ip, parent_ip, op, regs);
+ if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching())
+ op->func(ip, parent_ip, op, fregs);
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
NOKPROBE_SYMBOL(ftrace_ops_assist_func);
@@ -6907,11 +7448,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the function does not handle recursion, needs to be RCU safe,
- * or does per cpu logic, then we need to call the assist handler.
+ * If the function does not handle recursion or needs to be RCU safe,
+ * then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & FTRACE_OPS_FL_RCU)
+ if (ops->flags & (FTRACE_OPS_FL_RECURSION |
+ FTRACE_OPS_FL_RCU))
return ftrace_ops_assist_func;
return ops->func;
@@ -6919,7 +7460,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
@@ -6969,12 +7512,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
@@ -7011,10 +7554,10 @@ static void clear_ftrace_pids(struct trace_array *tr, int type)
synchronize_rcu();
if ((type & TRACE_PIDS) && pid_list)
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
if ((type & TRACE_NO_PIDS) && no_pid_list)
- trace_free_pid_list(no_pid_list);
+ trace_pid_list_free(no_pid_list);
}
void ftrace_clear_pids(struct trace_array *tr)
@@ -7255,7 +7798,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_rcu();
- trace_free_pid_list(filtered_pids);
+ trace_pid_list_free(filtered_pids);
} else if (pid_list && !other_pids) {
/* Register a probe to set whether to ignore the tracing of a task */
register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
@@ -7321,10 +7864,10 @@ static const struct file_operations ftrace_no_pid_fops = {
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer,
tr, &ftrace_pid_fops);
- trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer,
- tr, &ftrace_no_pid_fops);
+ trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE,
+ d_tracer, tr, &ftrace_no_pid_fops);
}
void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
@@ -7352,16 +7895,155 @@ void ftrace_kill(void)
}
/**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
return ftrace_disabled;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When registering ftrace_ops with IPMODIFY, it is necessary to make sure
+ * it doesn't conflict with any direct ftrace_ops. If there is existing
+ * direct ftrace_ops on a kernel function being patched, call
+ * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing.
+ *
+ * @ops: ftrace_ops being registered.
+ *
+ * Returns:
+ * 0 on success;
+ * Negative on failure.
+ */
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i, ret;
+
+ lockdep_assert_held_once(&direct_mutex);
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ if (found_op) {
+ if (!op->ops_func)
+ return -EBUSY;
+
+ ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Similar to prepare_direct_functions_for_ipmodify, clean up after ops
+ * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT
+ * ops.
+ */
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return;
+
+ mutex_lock(&direct_mutex);
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ /* The cleanup is optional, ignore any errors */
+ if (found_op && op->ops_func)
+ op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ }
+ }
+ mutex_unlock(&direct_mutex);
+}
+
+#define lock_direct_mutex() mutex_lock(&direct_mutex)
+#define unlock_direct_mutex() mutex_unlock(&direct_mutex)
+
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ return 0;
+}
+
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+}
+
+#define lock_direct_mutex() do { } while (0)
+#define unlock_direct_mutex() do { } while (0)
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+/*
+ * Similar to register_ftrace_function, except we don't lock direct_mutex.
+ */
+static int register_ftrace_function_nolock(struct ftrace_ops *ops)
+{
+ int ret;
+
+ ftrace_ops_init(ops);
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_startup(ops, 0);
+
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
/**
* register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
+ * @ops: ops structure that holds the function for profiling.
*
* Register a function to be called by all functions in the
* kernel.
@@ -7372,23 +8054,24 @@ int ftrace_is_dead(void)
*/
int register_ftrace_function(struct ftrace_ops *ops)
{
- int ret = -1;
-
- ftrace_ops_init(ops);
-
- mutex_lock(&ftrace_lock);
+ int ret;
- ret = ftrace_startup(ops, 0);
+ lock_direct_mutex();
+ ret = prepare_direct_functions_for_ipmodify(ops);
+ if (ret < 0)
+ goto out_unlock;
- mutex_unlock(&ftrace_lock);
+ ret = register_ftrace_function_nolock(ops);
+out_unlock:
+ unlock_direct_mutex();
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_function);
/**
* unregister_ftrace_function - unregister a function for profiling.
- * @ops - ops structure that holds the function to unregister
+ * @ops: ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
*/
@@ -7400,10 +8083,127 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
+ cleanup_direct_functions_after_ipmodify(ops);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+struct kallsyms_data {
+ unsigned long *addrs;
+ const char **syms;
+ size_t cnt;
+ size_t found;
+};
+
+/* This function gets called for all kernel and module symbols
+ * and returns 1 in case we resolved all the requested symbols,
+ * 0 otherwise.
+ */
+static int kallsyms_callback(void *data, const char *name, unsigned long addr)
+{
+ struct kallsyms_data *args = data;
+ const char **sym;
+ int idx;
+
+ sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp);
+ if (!sym)
+ return 0;
+
+ idx = sym - args->syms;
+ if (args->addrs[idx])
+ return 0;
+
+ if (!ftrace_location(addr))
+ return 0;
+
+ args->addrs[idx] = addr;
+ args->found++;
+ return args->found == args->cnt ? 1 : 0;
+}
+
+/**
+ * ftrace_lookup_symbols - Lookup addresses for array of symbols
+ *
+ * @sorted_syms: array of symbols pointers symbols to resolve,
+ * must be alphabetically sorted
+ * @cnt: number of symbols/addresses in @syms/@addrs arrays
+ * @addrs: array for storing resulting addresses
+ *
+ * This function looks up addresses for array of symbols provided in
+ * @syms array (must be alphabetically sorted) and stores them in
+ * @addrs array, which needs to be big enough to store at least @cnt
+ * addresses.
+ *
+ * This function returns 0 if all provided symbols are found,
+ * -ESRCH otherwise.
+ */
+int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
+{
+ struct kallsyms_data args;
+ int found_all;
+
+ memset(addrs, 0, sizeof(*addrs) * cnt);
+ args.addrs = addrs;
+ args.syms = sorted_syms;
+ args.cnt = cnt;
+ args.found = 0;
+
+ found_all = kallsyms_on_each_symbol(kallsyms_callback, &args);
+ if (found_all)
+ return 0;
+ found_all = module_kallsyms_on_each_symbol(NULL, kallsyms_callback, &args);
+ return found_all ? 0 : -ESRCH;
+}
+
+#ifdef CONFIG_SYSCTL
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void ftrace_startup_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* Force update next time */
+ saved_ftrace_func = NULL;
+ /* ftrace_start_up is true if we want ftrace running */
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* ftrace_start_up is true if ftrace is running */
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
+}
+#else
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
static bool is_permanent_ops_registered(void)
{
struct ftrace_ops *op;
@@ -7416,10 +8216,9 @@ static bool is_permanent_ops_registered(void)
return false;
}
-int
+static int
ftrace_enable_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = -ENODEV;
@@ -7460,3 +8259,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_unlock(&ftrace_lock);
return ret;
}
+
+static struct ctl_table ftrace_sysctls[] = {
+ {
+ .procname = "ftrace_enabled",
+ .data = &ftrace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ftrace_enable_sysctl,
+ },
+ {}
+};
+
+static int __init ftrace_sysctl_init(void)
+{
+ register_sysctl_init("kernel", ftrace_sysctls);
+ return 0;
+}
+late_initcall(ftrace_sysctl_init);
+#endif
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 382775edf690..5012c04f92c0 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -2,6 +2,9 @@
#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H
#define _LINUX_KERNEL_FTRACE_INTERNAL_H
+int __register_ftrace_function(struct ftrace_ops *ops);
+int __unregister_ftrace_function(struct ftrace_ops *ops);
+
#ifdef CONFIG_FUNCTION_TRACER
extern struct mutex ftrace_lock;
@@ -15,8 +18,6 @@ int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
#else /* !CONFIG_DYNAMIC_FTRACE */
-int __register_ftrace_function(struct ftrace_ops *ops);
-int __unregister_ftrace_function(struct ftrace_ops *ops);
/* Keep as macros so we do not need to define the commands */
# define ftrace_startup(ops, command) \
({ \
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 18b0f1cbb947..5a4b722b5045 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -21,7 +21,7 @@
* Then:
*
* # insmod kernel/trace/kprobe_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see many instances of the "gen_kprobe_test" and
* "gen_kretprobe_test" events in the trace buffer.
@@ -35,6 +35,49 @@
static struct trace_event_file *gen_kprobe_test;
static struct trace_event_file *gen_kretprobe_test;
+#define KPROBE_GEN_TEST_FUNC "do_sys_open"
+
+/* X86 */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%ax"
+#define KPROBE_GEN_TEST_ARG1 "filename=%dx"
+#define KPROBE_GEN_TEST_ARG2 "flags=%cx"
+#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)"
+
+/* ARM64 */
+#elif defined(CONFIG_ARM64)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%x0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%x1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%x2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%x3"
+
+/* ARM */
+#elif defined(CONFIG_ARM)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%r0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%r1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%r2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%r3"
+
+/* RISCV */
+#elif defined(CONFIG_RISCV)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%a0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%a1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%a2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%a3"
+
+/* others */
+#else
+#define KPROBE_GEN_TEST_ARG0 NULL
+#define KPROBE_GEN_TEST_ARG1 NULL
+#define KPROBE_GEN_TEST_ARG2 NULL
+#define KPROBE_GEN_TEST_ARG3 NULL
+#endif
+
+static bool trace_event_file_is_valid(struct trace_event_file *input)
+{
+ return input && !IS_ERR(input);
+}
+
/*
* Test to make sure we can create a kprobe event, then add more
* fields.
@@ -58,23 +101,23 @@ static int __init test_gen_kprobe_cmd(void)
* fields.
*/
ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
- "do_sys_open",
- "dfd=%ax", "filename=%dx");
+ KPROBE_GEN_TEST_FUNC,
+ KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
if (ret)
- goto free;
+ goto out;
/* Use kprobe_event_add_fields to add the rest of the fields */
- ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+ ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kprobe_test event file. We need to prevent
@@ -97,13 +140,13 @@ static int __init test_gen_kprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kprobe_test))
+ gen_kprobe_test = NULL;
/* We got an error after creating the event, delete it */
- ret = kprobe_event_delete("gen_kprobe_test");
- free:
- kfree(buf);
-
+ kprobe_event_delete("gen_kprobe_test");
goto out;
}
@@ -128,17 +171,17 @@ static int __init test_gen_kretprobe_cmd(void)
* Define the kretprobe event.
*/
ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
- "do_sys_open",
+ KPROBE_GEN_TEST_FUNC,
"$retval");
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kretprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kretprobe_test event file. We need to
@@ -162,13 +205,13 @@ static int __init test_gen_kretprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kretprobe_test))
+ gen_kretprobe_test = NULL;
/* We got an error after creating the event, delete it */
- ret = kprobe_event_delete("gen_kretprobe_test");
- free:
- kfree(buf);
-
+ kprobe_event_delete("gen_kretprobe_test");
goto out;
}
@@ -182,10 +225,12 @@ static int __init kprobe_event_gen_test_init(void)
ret = test_gen_kretprobe_cmd();
if (ret) {
- WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
- trace_put_event_file(gen_kretprobe_test);
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+ trace_put_event_file(gen_kretprobe_test);
+ }
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
}
@@ -194,24 +239,30 @@ static int __init kprobe_event_gen_test_init(void)
static void __exit kprobe_event_gen_test_exit(void)
{
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kprobe_test", false));
+ if (trace_event_file_is_valid(gen_kprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kprobe_test"));
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kretprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kretprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
new file mode 100644
index 000000000000..95106d02b32d
--- /dev/null
+++ b/kernel/trace/pid_list.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+#include <linux/spinlock.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include "trace.h"
+
+/* See pid_list.h for details */
+
+static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
+{
+ union lower_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->lower_list)
+ return NULL;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = chunk->next;
+ pid_list->free_lower_chunks--;
+ WARN_ON_ONCE(pid_list->free_lower_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_lower_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->upper_list)
+ return NULL;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = chunk->next;
+ pid_list->free_upper_chunks--;
+ WARN_ON_ONCE(pid_list->free_upper_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_upper_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline void put_lower_chunk(struct trace_pid_list *pid_list,
+ union lower_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+}
+
+static inline void put_upper_chunk(struct trace_pid_list *pid_list,
+ union upper_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+}
+
+static inline bool upper_empty(union upper_chunk *chunk)
+{
+ /*
+ * If chunk->data has no lower chunks, it will be the same
+ * as a zeroed bitmask. Use find_first_bit() to test it
+ * and if it doesn't find any bits set, then the array
+ * is empty.
+ */
+ int bit = find_first_bit((unsigned long *)chunk->data,
+ sizeof(chunk->data) * 8);
+ return bit >= sizeof(chunk->data) * 8;
+}
+
+static inline int pid_split(unsigned int pid, unsigned int *upper1,
+ unsigned int *upper2, unsigned int *lower)
+{
+ /* MAX_PID should cover all pids */
+ BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT);
+
+ /* In case a bad pid is passed in, then fail */
+ if (unlikely(pid >= MAX_PID))
+ return -1;
+
+ *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK;
+ *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK;
+ *lower = pid & LOWER_MASK;
+
+ return 0;
+}
+
+static inline unsigned int pid_join(unsigned int upper1,
+ unsigned int upper2, unsigned int lower)
+{
+ return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) |
+ ((upper2 & UPPER_MASK) << UPPER2_SHIFT) |
+ (lower & LOWER_MASK);
+}
+
+/**
+ * trace_pid_list_is_set - test if the pid is set in the list
+ * @pid_list: The pid list to test
+ * @pid: The pid to see if set in the list.
+ *
+ * Tests if @pid is set in the @pid_list. This is usually called
+ * from the scheduler when a task is scheduled. Its pid is checked
+ * if it should be traced or not.
+ *
+ * Return true if the pid is in the list, false otherwise.
+ */
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ bool ret = false;
+
+ if (!pid_list)
+ return false;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return false;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+
+ return ret;
+}
+
+/**
+ * trace_pid_list_set - add a pid to the list
+ * @pid_list: The pid list to add the @pid to.
+ * @pid: The pid to add.
+ *
+ * Adds @pid to @pid_list. This is usually done explicitly by a user
+ * adding a task to be traced, or indirectly by the fork function
+ * when children should be traced and a task's pid is in the list.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ int ret;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk) {
+ upper_chunk = get_upper_chunk(pid_list);
+ if (!upper_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pid_list->upper[upper1] = upper_chunk;
+ }
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk) {
+ lower_chunk = get_lower_chunk(pid_list);
+ if (!lower_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper_chunk->data[upper2] = lower_chunk;
+ }
+ set_bit(lower, lower_chunk->data);
+ ret = 0;
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return ret;
+}
+
+/**
+ * trace_pid_list_clear - remove a pid from the list
+ * @pid_list: The pid list to remove the @pid from.
+ * @pid: The pid to remove.
+ *
+ * Removes @pid from @pid_list. This is usually done explicitly by a user
+ * removing tasks from tracing, or indirectly by the exit function
+ * when a task that is set to be traced exits.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk)
+ goto out;
+
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ goto out;
+
+ clear_bit(lower, lower_chunk->data);
+
+ /* if there's no more bits set, add it to the free list */
+ if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
+ put_lower_chunk(pid_list, lower_chunk);
+ upper_chunk->data[upper2] = NULL;
+ if (upper_empty(upper_chunk)) {
+ put_upper_chunk(pid_list, upper_chunk);
+ pid_list->upper[upper1] = NULL;
+ }
+ }
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return 0;
+}
+
+/**
+ * trace_pid_list_next - return the next pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pid to start from
+ * @next: The pointer to place the pid that is set starting from @pid.
+ *
+ * Looks for the next consecutive pid that is in @pid_list starting
+ * at the pid specified by @pid. If one is set (including @pid), then
+ * that pid is placed into @next.
+ *
+ * Return 0 when a pid is found, -1 if there are no more pids included.
+ */
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
+ upper_chunk = pid_list->upper[upper1];
+
+ if (!upper_chunk)
+ continue;
+
+ for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ continue;
+
+ lower = find_next_bit(lower_chunk->data, LOWER_MAX,
+ lower);
+ if (lower < LOWER_MAX)
+ goto found;
+ }
+ }
+
+ found:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ if (upper1 > UPPER_MASK)
+ return -1;
+
+ *next = pid_join(upper1, upper2, lower);
+ return 0;
+}
+
+/**
+ * trace_pid_list_first - return the first pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pointer to place the pid first found pid that is set.
+ *
+ * Looks for the first pid that is set in @pid_list, and places it
+ * into @pid if found.
+ *
+ * Return 0 when a pid is found, -1 if there are no pids set.
+ */
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid)
+{
+ return trace_pid_list_next(pid_list, 0, pid);
+}
+
+static void pid_list_refill_irq(struct irq_work *iwork)
+{
+ struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list,
+ refill_irqwork);
+ union upper_chunk *upper = NULL;
+ union lower_chunk *lower = NULL;
+ union upper_chunk **upper_next = &upper;
+ union lower_chunk **lower_next = &lower;
+ int upper_count;
+ int lower_count;
+ int ucnt = 0;
+ int lcnt = 0;
+
+ again:
+ raw_spin_lock(&pid_list->lock);
+ upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
+ lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ raw_spin_unlock(&pid_list->lock);
+
+ if (upper_count <= 0 && lower_count <= 0)
+ return;
+
+ while (upper_count-- > 0) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *upper_next = chunk;
+ upper_next = &chunk->next;
+ ucnt++;
+ }
+
+ while (lower_count-- > 0) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *lower_next = chunk;
+ lower_next = &chunk->next;
+ lcnt++;
+ }
+
+ raw_spin_lock(&pid_list->lock);
+ if (upper) {
+ *upper_next = pid_list->upper_list;
+ pid_list->upper_list = upper;
+ pid_list->free_upper_chunks += ucnt;
+ }
+ if (lower) {
+ *lower_next = pid_list->lower_list;
+ pid_list->lower_list = lower;
+ pid_list->free_lower_chunks += lcnt;
+ }
+ raw_spin_unlock(&pid_list->lock);
+
+ /*
+ * On success of allocating all the chunks, both counters
+ * will be less than zero. If they are not, then an allocation
+ * failed, and we should not try again.
+ */
+ if (upper_count >= 0 || lower_count >= 0)
+ return;
+ /*
+ * When the locks were released, free chunks could have
+ * been used and allocation needs to be done again. Might as
+ * well allocate it now.
+ */
+ goto again;
+}
+
+/**
+ * trace_pid_list_alloc - create a new pid_list
+ *
+ * Allocates a new pid_list to store pids into.
+ *
+ * Returns the pid_list on success, NULL otherwise.
+ */
+struct trace_pid_list *trace_pid_list_alloc(void)
+{
+ struct trace_pid_list *pid_list;
+ int i;
+
+ /* According to linux/thread.h, pids can be no bigger that 30 bits */
+ WARN_ON_ONCE(pid_max > (1 << 30));
+
+ pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return NULL;
+
+ init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
+
+ raw_spin_lock_init(&pid_list->lock);
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+ }
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+ }
+
+ return pid_list;
+}
+
+/**
+ * trace_pid_list_free - Frees an allocated pid_list.
+ *
+ * Frees the memory for a pid_list that was allocated.
+ */
+void trace_pid_list_free(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *upper;
+ union lower_chunk *lower;
+ int i, j;
+
+ if (!pid_list)
+ return;
+
+ irq_work_sync(&pid_list->refill_irqwork);
+
+ while (pid_list->lower_list) {
+ union lower_chunk *chunk;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = pid_list->lower_list->next;
+ kfree(chunk);
+ }
+
+ while (pid_list->upper_list) {
+ union upper_chunk *chunk;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = pid_list->upper_list->next;
+ kfree(chunk);
+ }
+
+ for (i = 0; i < UPPER1_SIZE; i++) {
+ upper = pid_list->upper[i];
+ if (upper) {
+ for (j = 0; j < UPPER2_SIZE; j++) {
+ lower = upper->data[j];
+ kfree(lower);
+ }
+ kfree(upper);
+ }
+ }
+ kfree(pid_list);
+}
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
new file mode 100644
index 000000000000..62e73f1ac85f
--- /dev/null
+++ b/kernel/trace/pid_list.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Do not include this file directly. */
+
+#ifndef _TRACE_INTERNAL_PID_LIST_H
+#define _TRACE_INTERNAL_PID_LIST_H
+
+/*
+ * In order to keep track of what pids to trace, a tree is created much
+ * like page tables are used. This creates a sparse bit map, where
+ * the tree is filled in when needed. A PID is at most 30 bits (see
+ * linux/thread.h), and is broken up into 3 sections based on the bit map
+ * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the
+ * "upper2" section and the 14 LSB is the "lower" section.
+ *
+ * A trace_pid_list structure holds the "upper1" section, in an
+ * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where
+ * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk"
+ * structures, where each has an array of size 2K bytes representing a bitmask
+ * of the 14 LSB of the PID (256 * 8 = 2048)
+ *
+ * When a trace_pid_list is allocated, it includes the 256 pointer array
+ * of the upper1 unions. Then a "cache" of upper and lower is allocated
+ * where these will be assigned as needed.
+ *
+ * When a bit is set in the pid_list bitmask, the pid to use has
+ * the 8 MSB masked, and this is used to index the array in the
+ * pid_list to find the next upper union. If the element is NULL,
+ * then one is retrieved from the upper_list cache. If none is
+ * available, then -ENOMEM is returned.
+ *
+ * The next 8 MSB is used to index into the "upper2" section. If this
+ * element is NULL, then it is retrieved from the lower_list cache.
+ * Again, if one is not available -ENOMEM is returned.
+ *
+ * Finally the 14 LSB of the PID is used to set the bit in the 16384
+ * bitmask (made up of 2K bytes).
+ *
+ * When the second upper section or the lower section has their last
+ * bit cleared, they are added back to the free list to be reused
+ * when needed.
+ */
+
+#define UPPER_BITS 8
+#define UPPER_MAX (1 << UPPER_BITS)
+#define UPPER1_SIZE (1 << UPPER_BITS)
+#define UPPER2_SIZE (1 << UPPER_BITS)
+
+#define LOWER_BITS 14
+#define LOWER_MAX (1 << LOWER_BITS)
+#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG)
+
+#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS)
+#define UPPER2_SHIFT LOWER_BITS
+#define LOWER_MASK (LOWER_MAX - 1)
+
+#define UPPER_MASK (UPPER_MAX - 1)
+
+/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */
+#define MAX_PID (1 << 30)
+
+/* Just keep 6 chunks of both upper and lower in the cache on alloc */
+#define CHUNK_ALLOC 6
+
+/* Have 2 chunks free, trigger a refill of the cache */
+#define CHUNK_REALLOC 2
+
+union lower_chunk {
+ union lower_chunk *next;
+ unsigned long data[LOWER_SIZE]; // 2K in size
+};
+
+union upper_chunk {
+ union upper_chunk *next;
+ union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size
+};
+
+struct trace_pid_list {
+ raw_spinlock_t lock;
+ struct irq_work refill_irqwork;
+ union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
+ union upper_chunk *upper_list;
+ union lower_chunk *lower_list;
+ int free_upper_chunks;
+ int free_lower_chunks;
+};
+
+#endif /* _TRACE_INTERNAL_PID_LIST_H */
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 312d1a0ca3b6..8c4ffd076162 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -21,13 +21,16 @@
static ulong delay = 100;
static char test_mode[12] = "irq";
static uint burst_size = 1;
+static int cpu_affinity = -1;
module_param_named(delay, delay, ulong, 0444);
module_param_string(test_mode, test_mode, 12, 0444);
module_param_named(burst_size, burst_size, uint, 0444);
+module_param_named(cpu_affinity, cpu_affinity, int, 0444);
MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
@@ -36,7 +39,9 @@ static struct completion done;
static void busy_wait(ulong time)
{
u64 start, end;
+
start = trace_clock_local();
+
do {
end = trace_clock_local();
if (kthread_should_stop())
@@ -47,6 +52,7 @@ static void busy_wait(ulong time)
static __always_inline void irqoff_test(void)
{
unsigned long flags;
+
local_irq_save(flags);
busy_wait(delay);
local_irq_restore(flags);
@@ -113,6 +119,14 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
+ struct cpumask cpu_mask;
+
+ if (cpu_affinity > -1) {
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(cpu_affinity, &cpu_mask);
+ if (set_cpus_allowed_ptr(current, &cpu_mask))
+ pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
+ }
for (i = 0; i < s; i++)
(testfuncs[i])(i);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..fa03094e9e69
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ preempt_disable();
+ rethook_recycle(rhn);
+ preempt_enable();
+ }
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ objpool_fini(&rh->pool);
+}
+
+/**
+ * rethook_stop() - Stop using a rethook.
+ * @rh: the struct rethook to stop.
+ *
+ * Stop using a rethook to prepare for freeing it. If you want to wait for
+ * all running rethook handler before calling rethook_free(), you need to
+ * call this first and wait RCU, and call rethook_free().
+ */
+void rethook_stop(struct rethook *rh)
+{
+ rcu_assign_pointer(rh->handler, NULL);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+ rethook_stop(rh);
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+static int rethook_init_node(void *nod, void *context)
+{
+ struct rethook_node *node = nod;
+
+ node->rethook = context;
+ return 0;
+}
+
+static int rethook_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
+static inline rethook_handler_t rethook_get_handler(struct rethook *rh)
+{
+ return (rethook_handler_t)rcu_dereference_check(rh->handler,
+ rcu_read_lock_any_held());
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function, must NOT be NULL
+ * @size: node size: rethook node and additional data
+ * @num: number of rethook nodes to be preallocated
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return pointer of new rethook, or error codes for failures.
+ *
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
+ int size, int num)
+{
+ struct rethook *rh;
+
+ if (!handler || num <= 0 || size < sizeof(struct rethook_node))
+ return ERR_PTR(-EINVAL);
+
+ rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ if (!rh)
+ return ERR_PTR(-ENOMEM);
+
+ rh->data = data;
+ rcu_assign_pointer(rh->handler, handler);
+
+ /* initialize the objpool for rethook nodes */
+ if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
+ rethook_init_node, rethook_fini_pool)) {
+ kfree(rh);
+ return ERR_PTR(-ENOMEM);
+ }
+ return rh;
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+ struct rethook *rh = node->rethook;
+
+ objpool_drop(node, &rh->pool);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+ rethook_handler_t handler;
+
+ handler = rethook_get_handler(node->rethook);
+ if (likely(handler))
+ objpool_push(node, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ rethook_handler_t handler = rethook_get_handler(rh);
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!handler))
+ return NULL;
+
+ /*
+ * This expects the caller will set up a rethook on a function entry.
+ * When the function returns, the rethook will eventually be reclaimed
+ * or released in the rethook_recycle() with call_rcu().
+ * This means the caller must be run in the RCU-availabe context.
+ */
+ if (unlikely(!rcu_is_watching()))
+ return NULL;
+
+ return (struct rethook_node *)objpool_pop(&rh->pool);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+ arch_rethook_prepare(node, regs, mcount);
+ __llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If the architecture which uses a
+ * frame pointer to record real return address on the stack,
+ * it should fill this function to fixup the return address
+ * so that stacktrace works from the rethook handler.
+ */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr;
+ rethook_handler_t handler;
+ struct rethook_node *rhn;
+
+ correct_ret_addr = __rethook_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+
+ /*
+ * These loops must be protected from rethook_free_rcu() because those
+ * are accessing 'rhn->rethook'.
+ */
+ preempt_disable_notrace();
+
+ /*
+ * Run the handler on the shadow stack. Do not unlink the list here because
+ * stackdump inside the handlers needs to decode it.
+ */
+ first = current->rethooks.first;
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ handler = rethook_get_handler(rhn->rethook);
+ if (handler)
+ handler(rhn, rhn->rethook->data,
+ correct_ret_addr, regs);
+
+ if (first == node)
+ break;
+ first = first->next;
+ }
+
+ /* Fixup registers for returning to correct address. */
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink used shadow stack */
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ first = first->next;
+ rethook_recycle(rhn);
+ }
+ preempt_enable_notrace();
+
+ return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00867ff82412..aa332ace108b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -26,8 +27,17 @@
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB (0xf8ULL << 56)
+#define ABS_TS_MASK (~TS_MSB)
+
static void update_pages_handler(struct work_struct *work);
/*
@@ -129,7 +139,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -145,7 +164,7 @@ enum {
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
-static inline int rb_null_event(struct ring_buffer_event *event)
+static inline bool rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
@@ -270,21 +289,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
+#define for_each_online_buffer_cpu(buffer, cpu) \
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
-/**
- * ring_buffer_event_time_stamp - return the event's extended timestamp
- * @event: the event to get the timestamp of
- *
- * Returns the extended timestamp associated with a data event.
- * An extended time_stamp is a 64-bit timestamp represented
- * internally in a special way that makes the best use of space
- * contained within a ring buffer event. This function decodes
- * it and maps it to a straight u64 value.
- */
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -306,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -320,6 +337,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -343,59 +361,23 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
-/*
- * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
- * this issue out.
- */
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
+{
+ return local_read(&bpage->page->commit);
+}
+
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
/*
* We need to fit the time_stamp delta into 27 bits.
*/
-static inline int test_time_stamp(u64 delta)
+static inline bool test_time_stamp(u64 delta)
{
- if (delta & TS_DELTA_TEST)
- return 1;
- return 0;
-}
-
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
+ return !!(delta & TS_DELTA_TEST);
}
struct rb_irq_work {
@@ -413,21 +395,38 @@ struct rb_irq_work {
struct rb_event_info {
u64 ts;
u64 delta;
+ u64 before;
+ u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
/*
+ * Used for the add_timestamp
+ * NONE
+ * EXTEND - wants a time extend
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
+ * FORCE - force a full time stamp.
+ */
+enum {
+ RB_ADD_STAMP_NONE = 0,
+ RB_ADD_STAMP_EXTEND = BIT(1),
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
+ RB_ADD_STAMP_FORCE = BIT(3)
+};
+/*
* Used for which event context the event is in.
- * NMI = 0
- * IRQ = 1
- * SOFTIRQ = 2
- * NORMAL = 3
+ * TRANSITION = 0
+ * NMI = 1
+ * IRQ = 2
+ * SOFTIRQ = 3
+ * NORMAL = 4
*
* See trace_recursive_lock() comment below for more details.
*/
enum {
+ RB_CTX_TRANSITION,
RB_CTX_NMI,
RB_CTX_IRQ,
RB_CTX_SOFTIRQ,
@@ -435,6 +434,13 @@ enum {
RB_CTX_MAX
};
+struct rb_time_struct {
+ local64_t time;
+};
+typedef struct rb_time_struct rb_time_t;
+
+#define MAX_NEST 5
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -465,13 +471,18 @@ struct ring_buffer_per_cpu {
local_t committing;
local_t commits;
local_t pages_touched;
+ local_t pages_lost;
local_t pages_read;
long last_pages_touch;
size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
- u64 write_stamp;
+ rb_time_t write_stamp;
+ rb_time_t before_stamp;
+ u64 event_stamp[MAX_NEST];
u64 read_stamp;
+ /* pages removed since last reset */
+ unsigned long pages_removed;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -485,6 +496,7 @@ struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resizing;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -498,6 +510,10 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -507,12 +523,163 @@ struct ring_buffer_iter {
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
+ unsigned long cache_pages_removed;
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
+{
+ struct buffer_data_page field;
+
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
+
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
+}
+
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
+{
+ *ret = local64_read(&t->time);
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ local64_set(&t->time, val);
+}
+
+/*
+ * Enable this to make sure that the event passed to
+ * ring_buffer_event_time_stamp() is not committed and also
+ * is on the buffer that it passed in.
+ */
+//#define RB_VERIFY_EVENT
+#ifdef RB_VERIFY_EVENT
+static struct list_head *rb_list_head(struct list_head *list);
+static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+ struct buffer_page *page = cpu_buffer->commit_page;
+ struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page);
+ struct list_head *next;
+ long commit, write;
+ unsigned long addr = (unsigned long)event;
+ bool done = false;
+ int stop = 0;
+
+ /* Make sure the event exists and is not committed yet */
+ do {
+ if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
+ done = true;
+ commit = local_read(&page->page->commit);
+ write = local_read(&page->write);
+ if (addr >= (unsigned long)&page->page->data[commit] &&
+ addr < (unsigned long)&page->page->data[write])
+ return;
+
+ next = rb_list_head(page->list.next);
+ page = list_entry(next, struct buffer_page, list);
+ } while (!done);
+ WARN_ON_ONCE(1);
+}
+#else
+static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+}
+#endif
+
+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+ if (save_ts & TS_MSB) {
+ abs |= save_ts & TS_MSB;
+ /* Check for overflow */
+ if (unlikely(abs < save_ts))
+ abs += 1ULL << 59;
+ }
+ return abs;
+}
+
+static inline u64 rb_time_stamp(struct trace_buffer *buffer);
+
+/**
+ * ring_buffer_event_time_stamp - return the event's current time stamp
+ * @buffer: The buffer that the event is on
+ * @event: the event to get the time stamp of
+ *
+ * Note, this must be called after @event is reserved, and before it is
+ * committed to the ring buffer. And must be called from the same
+ * context where the event was reserved (normal, softirq, irq, etc).
+ *
+ * Returns the time stamp associated with the current event.
+ * If the event has an extended time stamp, then that is used as
+ * the time stamp to return.
+ * In the highly unlikely case that the event was nested more than
+ * the max nesting, then the write_stamp of the buffer is returned,
+ * otherwise current time is returned, but that really neither of
+ * the last two cases should ever happen.
+ */
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
+ unsigned int nest;
+ u64 ts;
+
+ /* If the event includes an absolute time, then just use that */
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ ts = rb_event_time_stamp(event);
+ return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+ }
+
+ nest = local_read(&cpu_buffer->committing);
+ verify_event(cpu_buffer, event);
+ if (WARN_ON_ONCE(!nest))
+ goto fail;
+
+ /* Read the current saved nesting level time stamp */
+ if (likely(--nest < MAX_NEST))
+ return cpu_buffer->event_stamp[nest];
+
+ /* Shouldn't happen, warn if it does */
+ WARN_ONCE(1, "nest (%d) greater than max", nest);
+
+ fail:
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
+
+ return ts;
+}
+
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
@@ -526,7 +693,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
- * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -535,10 +702,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
+ size_t lost;
size_t cnt;
read = local_read(&buffer->buffers[cpu]->pages_read);
+ lost = local_read(&buffer->buffers[cpu]->pages_lost);
cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+
+ if (WARN_ON_ONCE(cnt < lost))
+ return 0;
+
+ cnt -= lost;
+
/* The reader can read an empty page, but not more than that */
if (cnt < read) {
WARN_ON_ONCE(read > cnt + 1);
@@ -548,6 +723,26 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ /*
+ * Add one as dirty will never equal nr_pages, as the sub-buffer
+ * that the writer is on is not counted as dirty.
+ * This is needed if "buffer_percent" is set to 100.
+ */
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
+
+ return (dirty * 100) >= (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -559,17 +754,99 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+ /* Only cpu_buffer sets the above flags */
+ struct ring_buffer_per_cpu *cpu_buffer =
+ container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+ /* Called from interrupt context */
+ raw_spin_lock(&cpu_buffer->reader_lock);
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
+
+ /* Waking up all waiters, they will reset the shortest full */
+ cpu_buffer->shortest_full = 0;
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+
wake_up_all(&rbwork->full_waiters);
}
}
/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ * @cpu: The CPU buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (!buffer)
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+
+ /* Wake up individual ones too. One level recursion */
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_wake_waiters(buffer, cpu);
+
+ rbwork = &buffer->irq_work;
+ } else {
+ if (WARN_ON_ONCE(!buffer->buffers))
+ return;
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ /* The CPU buffer may not have been initialized yet */
+ if (!cpu_buffer)
+ return;
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ /* This can be called in any context */
+ irq_work_queue(&rbwork->work);
+}
+
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ bool ret = false;
+
+ /* Reads of all CPUs always waits for any data */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return !ring_buffer_empty(buffer);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ return true;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ ret = !pagebusy && full_hit(buffer, cpu, full);
+
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+ return ret;
+}
+
+/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
- * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -577,7 +854,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
*/
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
int ret = 0;
@@ -598,77 +875,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- while (true) {
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- if (full)
- work->full_waiters_pending = true;
- else
- work->waiters_pending = true;
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
- break;
-
- if (cpu != RING_BUFFER_ALL_CPUS &&
- !ring_buffer_empty_cpu(buffer, cpu)) {
- unsigned long flags;
- bool pagebusy;
- size_t nr_pages;
- size_t dirty;
-
- if (!full)
- break;
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
- break;
- }
+ if (rb_watermark_hit(buffer, cpu, full))
+ goto out;
- schedule();
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
}
+ schedule();
+ out:
if (full)
finish_wait(&work->full_waiters, &wait);
else
finish_wait(&work->waiters, &wait);
+ if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
+ ret = -EINTR;
+
return ret;
}
@@ -678,6 +932,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -687,23 +942,38 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct rb_irq_work *work;
+ struct rb_irq_work *rbwork;
- if (cpu == RING_BUFFER_ALL_CPUS)
- work = &buffer->irq_work;
- else {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ rbwork = &buffer->irq_work;
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -EINVAL;
+ return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ if (full) {
+ unsigned long flags;
+
+ poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rbwork->full_waiters_pending = true;
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ } else {
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -719,6 +989,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -746,11 +1019,19 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
+ u64 ts;
+
+ /* Skip retpolines :-( */
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ ts = trace_clock_local();
+ else
+ ts = buffer->clock();
+
/* shift to debug/test normalization and TIME_EXTENTS */
- return buffer->clock() << DEBUG_SHIFT;
+ return ts << DEBUG_SHIFT;
}
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
{
u64 time;
@@ -868,8 +1149,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* its flags will be non zero.
*/
static inline int
-rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *page, struct list_head *list)
+rb_is_head_page(struct buffer_page *page, struct list_head *list)
{
unsigned long val;
@@ -898,8 +1178,7 @@ static bool rb_is_reader_page(struct buffer_page *page)
/*
* rb_set_list_to_head - set a list_head to be pointing to head.
*/
-static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
+static void rb_set_list_to_head(struct list_head *list)
{
unsigned long *ptr;
@@ -922,7 +1201,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Set the previous list pointer to have the HEAD flag.
*/
- rb_set_list_to_head(cpu_buffer, head->list.prev);
+ rb_set_list_to_head(head->list.prev);
}
static void rb_list_head_clear(struct list_head *list)
@@ -997,8 +1276,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
old_flag, RB_PAGE_NORMAL);
}
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
+static inline void rb_inc_page(struct buffer_page **bpage)
{
struct list_head *p = rb_list_head((*bpage)->list.next);
@@ -1030,11 +1308,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
for (i = 0; i < 3; i++) {
do {
- if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
return page;
}
- rb_inc_page(cpu_buffer, &page);
+ rb_inc_page(&page);
} while (page != head);
}
@@ -1043,19 +1321,16 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
return NULL;
}
-static int rb_head_page_replace(struct buffer_page *old,
+static bool rb_head_page_replace(struct buffer_page *old,
struct buffer_page *new)
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
- unsigned long ret;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
- ret = cmpxchg(ptr, val, (unsigned long)&new->list);
-
- return ret == val;
+ return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
}
/*
@@ -1122,28 +1397,12 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
-static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
unsigned long val = (unsigned long)bpage;
- if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
- return 1;
-
- return 0;
-}
-
-/**
- * rb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
-{
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
- return 1;
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
- return 1;
- return 0;
+ RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
/**
@@ -1153,42 +1412,32 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
* As a safety measure we check to make sure the data pages have not
* been corrupted.
*/
-static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = cpu_buffer->pages;
- struct buffer_page *bpage, *tmp;
-
- /* Reset the head page if it exists */
- if (cpu_buffer->head_page)
- rb_set_head_page(cpu_buffer);
+ struct list_head *head = rb_list_head(cpu_buffer->pages);
+ struct list_head *tmp;
- rb_head_page_deactivate(cpu_buffer);
-
- if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
- return -1;
- if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
- return -1;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->next)->prev) != head))
+ return;
- if (rb_check_list(cpu_buffer, head))
- return -1;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->prev)->next) != head))
+ return;
- list_for_each_entry_safe(bpage, tmp, head, list) {
+ for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
- bpage->list.next->prev != &bpage->list))
- return -1;
+ rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
+ return;
+
if (RB_WARN_ON(cpu_buffer,
- bpage->list.prev->next != &bpage->list))
- return -1;
- if (rb_check_list(cpu_buffer, &bpage->list))
- return -1;
+ rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
+ return;
}
-
- rb_head_page_activate(cpu_buffer);
-
- return 0;
}
-static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ long nr_pages, struct list_head *pages)
{
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
@@ -1228,16 +1477,20 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
struct page *page;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1266,7 +1519,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(!nr_pages);
- if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
return -ENOMEM;
/*
@@ -1316,7 +1569,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1350,11 +1604,13 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
- free_buffer_page(cpu_buffer->reader_page);
+ irq_work_sync(&cpu_buffer->irq_work.work);
- rb_head_page_deactivate(cpu_buffer);
+ free_buffer_page(cpu_buffer->reader_page);
if (head) {
+ rb_head_page_deactivate(cpu_buffer);
+
list_for_each_entry_safe(bpage, tmp, head, list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
@@ -1363,6 +1619,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(bpage);
}
+ free_page((unsigned long)cpu_buffer->free_page);
+
kfree(cpu_buffer);
}
@@ -1395,7 +1653,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1456,6 +1721,8 @@ ring_buffer_free(struct trace_buffer *buffer)
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ irq_work_sync(&buffer->irq_work.work);
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
@@ -1494,7 +1761,7 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
return local_read(&bpage->write) & RB_WRITE_MASK;
}
-static int
+static bool
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
@@ -1535,6 +1802,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
to_remove = rb_list_head(to_remove)->next;
head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
+ /* Read iterators need to reset themselves when some pages removed */
+ cpu_buffer->pages_removed += nr_removed;
next_page = rb_list_head(to_remove)->next;
@@ -1556,12 +1825,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cpu_buffer->head_page = list_entry(next_page,
struct buffer_page, list);
- /*
- * change read pointer to make sure any read iterators reset
- * themselves
- */
- cpu_buffer->read = 0;
-
/* pages are removed, resume tracing and then free the pages */
atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
@@ -1577,7 +1840,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cond_resched();
to_remove_page = tmp_iter_page;
- rb_inc_page(cpu_buffer, &tmp_iter_page);
+ rb_inc_page(&tmp_iter_page);
/* update the counters */
page_entries = rb_page_entries(to_remove_page);
@@ -1589,7 +1852,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
* Increment overrun to account for the lost events.
*/
local_add(page_entries, &cpu_buffer->overrun);
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
}
/*
@@ -1606,13 +1870,16 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
return nr_removed == 0;
}
-static int
+static bool
rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
- int retries, success;
+ unsigned long flags;
+ bool success;
+ int retries;
- raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /* Can be called at early boot up, where interrupts must not been enabled */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* We are holding the reader lock, so the reader page won't be swapped
* in the ring buffer. Now we are racing with the writer trying to
@@ -1628,15 +1895,16 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* spinning.
*/
retries = 10;
- success = 0;
+ success = false;
while (retries--) {
- struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
+ struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
- head_page = &rb_set_head_page(cpu_buffer)->list;
- if (!head_page)
+ if (!hpage)
break;
+ head_page = &hpage->list;
prev_page = head_page->prev;
first_page = pages->next;
@@ -1648,16 +1916,16 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
- r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
- if (r == head_page_with_bit) {
+ /* caution: head_page_with_bit gets updated on cmpxchg failure */
+ if (try_cmpxchg(&prev_page->next,
+ &head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
* pointer to point to end of list
*/
head_page->prev = last_page;
- success = 1;
+ success = true;
break;
}
}
@@ -1669,7 +1937,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* tracing
*/
RB_WARN_ON(cpu_buffer, !success);
- raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
/* free pages if they weren't inserted */
if (!success) {
@@ -1685,7 +1953,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- int success;
+ bool success;
if (cpu_buffer->nr_pages_to_update > 0)
success = rb_insert_pages(cpu_buffer);
@@ -1711,7 +1979,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -1720,30 +1988,28 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long nr_pages;
- int cpu, err = 0;
+ int cpu, err;
/*
* Always succeed at resizing a non-existent buffer:
*/
if (!buffer)
- return size;
+ return 0;
/* Make sure the requested buffer exists */
if (cpu_id != RING_BUFFER_ALL_CPUS &&
!cpumask_test_cpu(cpu_id, buffer->cpumask))
- return size;
+ return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
nr_pages = 2;
- size = nr_pages * BUF_PAGE_SIZE;
-
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
-
+ atomic_inc(&buffer->resizing);
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
@@ -1775,15 +2041,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
* allocated without receiving ENOMEM
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu)) {
+ if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
/* not enough memory for new pages */
err = -ENOMEM;
goto out_err;
}
+
+ cond_resched();
}
- get_online_cpus();
+ cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -1799,8 +2067,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- schedule_work_on(cpu,
- &cpu_buffer->update_pages_work);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu != smp_processor_id()) {
+ migrate_enable();
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ } else {
+ update_pages_handler(&cpu_buffer->update_pages_work);
+ migrate_enable();
+ }
}
}
@@ -1815,12 +2091,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- put_online_cpus();
+ cpus_read_unlock();
} else {
- /* Make sure this CPU has been initialized */
- if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
- goto out;
-
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
@@ -1841,25 +2113,33 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
- __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu_id)) {
+ __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
err = -ENOMEM;
goto out_err;
}
- get_online_cpus();
+ cpus_read_lock();
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- schedule_work_on(cpu_id,
- &cpu_buffer->update_pages_work);
- wait_for_completion(&cpu_buffer->update_done);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu_id == smp_processor_id()) {
+ rb_update_pages(cpu_buffer);
+ migrate_enable();
+ } else {
+ migrate_enable();
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ }
}
cpu_buffer->nr_pages_to_update = 0;
- put_online_cpus();
+ cpus_read_unlock();
}
out:
@@ -1886,8 +2166,9 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
atomic_dec(&buffer->record_disabled);
}
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
- return size;
+ return 0;
out_err:
for_each_buffer_cpu(buffer, cpu) {
@@ -1906,6 +2187,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
}
out_err_unlock:
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return err;
}
@@ -1934,11 +2216,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read);
}
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
-{
- return local_read(&bpage->page->commit);
-}
-
static struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
@@ -1957,6 +2234,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
commit = rb_page_commit(iter_head_page);
smp_rmb();
+
+ /* An event needs to be at least 8 bytes in size */
+ if (iter->head > commit - 8)
+ goto reset;
+
event = __rb_page_index(iter_head_page, iter->head);
length = rb_event_length(event);
@@ -1966,7 +2248,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2006,11 +2288,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2026,7 +2310,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
if (iter->head_page == cpu_buffer->reader_page)
iter->head_page = rb_set_head_page(cpu_buffer);
else
- rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_inc_page(&iter->head_page);
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
@@ -2079,7 +2363,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
/*
* The entries will be zeroed out when we move the
@@ -2129,7 +2414,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* want the outer most commit to reset it.
*/
new_head = next_page;
- rb_inc_page(cpu_buffer, &new_head);
+ rb_inc_page(&new_head);
ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
RB_PAGE_NORMAL);
@@ -2198,6 +2483,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2206,13 +2492,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2221,9 +2507,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
- /* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
-
/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event
@@ -2237,30 +2520,40 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* write counter enough to allow another writer to slip
* in on this page.
* We put in a discarded commit instead, to make sure
- * that this space is not used again.
+ * that this space is not used again, and this space will
+ * not be accounted into 'entries_bytes'.
*
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
+ /* Make sure the padding is visible before the write update */
+ smp_wmb();
+
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
+ /* account for padding bytes */
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
+
+ /* Make sure the padding is visible before the tail_page->write update */
+ smp_wmb();
+
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -2281,7 +2574,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
next_page = tail_page;
- rb_inc_page(cpu_buffer, &next_page);
+ rb_inc_page(&next_page);
/*
* If for some reason, we had an interrupt storm that made
@@ -2307,7 +2600,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* the buffer, unless the commit page is still on the
* reader page.
*/
- if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+ if (rb_is_head_page(next_page, &tail_page->list)) {
/*
* If the commit is not on the reader page, then
@@ -2338,7 +2631,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* have filled up the buffer with events
* from interrupts and such, and wrapped.
*
- * Note, if the tail page is also the on the
+ * Note, if the tail page is also on the
* reader_page, we let it move out.
*/
if (unlikely((cpu_buffer->commit_page !=
@@ -2372,9 +2665,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+/* Slow path */
+static struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2382,7 +2676,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2394,8 +2688,72 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
return skip_time_extend(event);
}
-static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event);
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ u64 write_stamp;
+
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)info->before,
+ (unsigned long long)info->after,
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event **event,
+ struct rb_event_info *info,
+ u64 *delta,
+ unsigned int *length)
+{
+ bool abs = info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+ if (unlikely(info->delta > (1ULL << 59))) {
+ /*
+ * Some timers can use more than 59 bits, and when a timestamp
+ * is added to the buffer, it will lose those bits.
+ */
+ if (abs && (info->ts & TS_MSB)) {
+ info->delta &= ABS_TS_MASK;
+
+ /* did the clock go backwards */
+ } else if (info->before == info->after && info->before > info->ts) {
+ /* not interrupted */
+ static int once;
+
+ /*
+ * This is possible with a recalibrating of the TSC.
+ * Do not produce a call stack, but just report it.
+ */
+ if (!once) {
+ once++;
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+ info->before, info->ts);
+ }
+ } else
+ rb_check_timestamp(cpu_buffer, info);
+ if (!abs)
+ info->delta = 0;
+ }
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
+ *length -= RB_LEN_TIME_EXTEND;
+ *delta = 0;
+}
/**
* rb_update_event - update event type and data
@@ -2415,26 +2773,21 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned length = info->length;
u64 delta = info->delta;
+ unsigned int nest = local_read(&cpu_buffer->committing) - 1;
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
+ if (!WARN_ON_ONCE(nest >= MAX_NEST))
+ cpu_buffer->event_stamp[nest] = info->ts;
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
- if (unlikely(info->add_timestamp)) {
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
- event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
+ if (unlikely(info->add_timestamp))
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2449,11 +2802,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
@@ -2473,33 +2826,51 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
-{
- return true;
-}
-#endif
-
-static inline int
+static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
unsigned long new_index, old_index;
struct buffer_page *bpage;
- unsigned long index;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
+ /*
+ * Make sure the tail_page is still the same and
+ * the next write location is the end of this event
+ */
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
+
+ /*
+ * For the before_stamp to be different than the write_stamp
+ * to make sure that the next event adds an absolute
+ * value and does not rely on the saved write stamp, which
+ * is now going to be bogus.
+ *
+ * By setting the before_stamp to zero, the next event
+ * is not going to use the write_stamp and will instead
+ * create an absolute timestamp. This means there's no
+ * reason to update the wirte_stamp!
+ */
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+ * write stamp for us, and use an absolute timestamp.
+ */
+
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2508,16 +2879,17 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
*/
old_index += write_mask;
new_index += write_mask;
- index = local_cmpxchg(&bpage->write, old_index, new_index);
- if (index == old_index) {
+
+ /* caution: old_index gets updated on cmpxchg failure */
+ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
- return 1;
+ return true;
}
}
/* could not discard */
- return 0;
+ return false;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
@@ -2548,19 +2920,21 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- /* Only update the write stamp if the page has an event */
- if (rb_page_write(cpu_buffer->commit_page))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ rb_inc_page(&cpu_buffer->commit_page);
/* add barrier to keep gcc from optimizing too much */
barrier();
}
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
@@ -2626,64 +3000,15 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp += delta;
- } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp = delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
rb_end_commit(cpu_buffer);
}
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -2707,10 +3032,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
@@ -2719,6 +3041,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
irq_work_queue(&cpu_buffer->irq_work.work);
}
+#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
+# define do_ring_buffer_record_recursion() \
+ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
+#else
+# define do_ring_buffer_record_recursion() do { } while (0)
+#endif
+
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -2729,10 +3058,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* a bit of overhead in something as critical as function tracing,
* we use a bitmask trick.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * bit 1 = NMI context
+ * bit 2 = IRQ context
+ * bit 3 = SoftIRQ context
+ * bit 4 = normal context.
*
* This works because this is the order of contexts that can
* preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -2755,28 +3084,57 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The least significant bit can be cleared this way, and it
* just so happens that it is the same bit corresponding to
* the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
*/
-static __always_inline int
+static __always_inline bool
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
- unsigned long pc = preempt_count();
- int bit;
+ int bit = interrupt_context_level();
- if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
- bit = RB_CTX_NORMAL;
- else
- bit = pc & NMI_MASK ? RB_CTX_NMI :
- pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+ bit = RB_CTX_NORMAL - bit;
- if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
- return 1;
+ if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+ /*
+ * It is possible that this was called by transitioning
+ * between interrupt context, and preempt_count() has not
+ * been updated yet. In this case, use the TRANSITION bit.
+ */
+ bit = RB_CTX_TRANSITION;
+ if (val & (1 << (bit + cpu_buffer->nest))) {
+ do_ring_buffer_record_recursion();
+ return true;
+ }
+ }
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
- return 0;
+ return false;
}
static __always_inline void
@@ -2786,8 +3144,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->current_context - (1 << cpu_buffer->nest);
}
-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5
/**
* ring_buffer_nest_start - Allow to trace while nested
@@ -2838,21 +3196,19 @@ void ring_buffer_nest_end(struct trace_buffer *buffer)
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
- * @event: The event pointer to commit.
*
* This commits the data to the ring buffer, and releases any locks held.
*
* Must be paired with ring_buffer_lock_reserve.
*/
-int ring_buffer_unlock_commit(struct trace_buffer *buffer,
- struct ring_buffer_event *event)
+int ring_buffer_unlock_commit(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -2864,58 +3220,366 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct rb_event_info *info)
+/* Special value to validate all deltas on a page. */
+#define CHECK_FULL_PAGE 1L
+
+#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
{
- WARN_ONCE(info->delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)info->delta,
- (unsigned long long)info->ts,
- (unsigned long long)cpu_buffer->write_stamp,
- sched_clock_stable() ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
- "or add trace_clock=global to the kernel command line\n");
- info->add_timestamp = 1;
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is an trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
}
+static void dump_buffer_page(struct buffer_data_page *bpage,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int e;
+
+ ts = bpage->time_stamp;
+ pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ ts = rb_fix_abs_ts(delta, ts);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ ts += event->time_delta;
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
+ break;
+
+ default:
+ break;
+ }
+ }
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
+}
+
+static DEFINE_PER_CPU(atomic_t, checking);
+static atomic_t ts_dump;
+
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+ /* There's some cases in boot up that this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
+/*
+ * Check if the current event time stamp matches the deltas on
+ * the buffer page.
+ */
+static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ struct buffer_data_page *bpage;
+ u64 ts, delta;
+ bool full = false;
+ int e;
+
+ bpage = info->tail_page->page;
+
+ if (tail == CHECK_FULL_PAGE) {
+ full = true;
+ tail = local_read(&bpage->commit);
+ } else if (info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
+ /* Ignore events with absolute time stamps */
+ return;
+ }
+
+ /*
+ * Do not check the first event (skip possible extends too).
+ * Also do not check if previous events have not been committed.
+ */
+ if (tail <= 8 || tail > local_read(&bpage->commit))
+ return;
+
+ /*
+ * If this interrupted another event,
+ */
+ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+ goto out;
+
+ ts = bpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ break;
+
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ }
+ }
+ if ((full && ts > info->ts) ||
+ (!full && ts + info->delta != info->ts)) {
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
+ }
+out:
+ atomic_dec(this_cpu_ptr(&checking));
+}
+#else
+static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+}
+#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
+
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
{
struct ring_buffer_event *event;
struct buffer_page *tail_page;
- unsigned long tail, write;
-
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(info->add_timestamp))
- info->length += RB_LEN_TIME_EXTEND;
+ unsigned long tail, write, w;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
- write = local_add_return(info->length, &tail_page->write);
+
+ /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
+ barrier();
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ info->ts = rb_time_stamp(cpu_buffer->buffer);
+
+ if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+ info->delta = info->ts;
+ } else {
+ /*
+ * If interrupting an event time update, we may need an
+ * absolute timestamp.
+ * Don't bother if this is the start of a new page (w == 0).
+ */
+ if (!w) {
+ /* Use the sub-buffer timestamp */
+ info->delta = 0;
+ } else if (unlikely(info->before != info->after)) {
+ info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ } else {
+ info->delta = info->ts - info->after;
+ if (unlikely(test_time_stamp(info->delta))) {
+ info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ }
+ }
+ }
+
+ /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
+
+ /*C*/ write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
write &= RB_WRITE_MASK;
+
tail = write - info->length;
+ /* See if we shot pass the end of this buffer page */
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+ return rb_move_tail(cpu_buffer, tail, info);
+ }
+
+ if (likely(tail == w)) {
+ /* Nothing interrupted us between A and C */
+ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
+ /*
+ * If something came in between C and D, the write stamp
+ * may now not be in sync. But that's fine as the before_stamp
+ * will be different and then next event will just be forced
+ * to use an absolute timestamp.
+ */
+ if (likely(!(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+ /* This did not interrupt any time update */
+ info->delta = info->ts - info->after;
+ else
+ /* Just use full timestamp for interrupting event */
+ info->delta = info->ts;
+ check_buffer(cpu_buffer, info, tail);
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+
+ /* Save the old before_stamp */
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+
+ /*
+ * Read a new timestamp and update the before_stamp to make
+ * the next event after this one force using an absolute
+ * timestamp. This is in case an interrupt were to come in
+ * between E and F.
+ */
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_set(&cpu_buffer->before_stamp, ts);
+
+ barrier();
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after == info->before && info->after < ts) {
+ /*
+ * Nothing came after this event between C and F, it is
+ * safe to use info->after for the delta as it
+ * matched info->before and is still valid.
+ */
+ info->delta = ts - info->after;
+ } else {
+ /*
+ * Interrupted between C and F:
+ * Lost the previous events time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
+ * came after this will still be correct (as they would
+ * have built their delta on the previous event.
+ */
+ info->delta = 0;
+ }
+ info->ts = ts;
+ info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+ }
+
/*
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+ if (unlikely(!tail && !(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
info->delta = 0;
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, tail, info);
-
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
@@ -2927,7 +3591,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then update
* its timestamp.
*/
- if (!tail)
+ if (unlikely(!tail))
tail_page->page->time_stamp = info->ts;
/* account for these added bytes */
@@ -2944,9 +3608,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_event *event;
struct rb_event_info info;
int nr_loops = 0;
- u64 diff;
+ int add_ts_default;
+
+ /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ (unlikely(in_nmi()))) {
+ return NULL;
+ }
rb_start_commit(cpu_buffer);
+ /* The commit page can not change after this */
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/*
@@ -2964,8 +3635,18 @@ rb_reserve_next_event(struct trace_buffer *buffer,
#endif
info.length = rb_calculate_event_length(length);
+
+ if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+ add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+ info.length += RB_LEN_TIME_EXTEND;
+ if (info.length > cpu_buffer->buffer->max_data_size)
+ goto out_fail;
+ } else {
+ add_ts_default = RB_ADD_STAMP_NONE;
+ }
+
again:
- info.add_timestamp = 0;
+ info.add_timestamp = add_ts_default;
info.delta = 0;
/*
@@ -2980,35 +3661,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- info.ts = rb_time_stamp(cpu_buffer->buffer);
- diff = info.ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- if (ring_buffer_time_stamp_abs(buffer)) {
- info.delta = info.ts;
- rb_handle_timestamp(cpu_buffer, &info);
- } else /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
- info.delta = diff;
- if (unlikely(test_time_stamp(info.delta)))
- rb_handle_timestamp(cpu_buffer, &info);
- }
-
event = __rb_reserve_next(cpu_buffer, &info);
if (unlikely(PTR_ERR(event) == -EAGAIN)) {
- if (info.add_timestamp)
+ if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
info.length -= RB_LEN_TIME_EXTEND;
goto again;
}
- if (!event)
- goto out_fail;
-
- return event;
-
+ if (likely(event))
+ return event;
out_fail:
rb_end_commit(cpu_buffer);
return NULL;
@@ -3052,7 +3714,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3086,7 +3748,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
@@ -3098,14 +3760,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* Because the commit page may be on the reader page we
* start with the next page and check the end loop there.
*/
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
start = bpage;
do {
if (bpage->page == (void *)addr) {
local_dec(&bpage->entries);
return;
}
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
} while (bpage != start);
/* commit not part of this buffer?? */
@@ -3113,7 +3775,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * ring_buffer_commit_discard - discard an event that has not been committed
+ * ring_buffer_discard_commit - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*
@@ -3154,11 +3816,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
if (rb_try_to_discard(cpu_buffer, event))
goto out;
- /*
- * The commit is still visible by the reader, so we
- * must still update the timestamp.
- */
- rb_update_write_stamp(cpu_buffer, event);
out:
rb_end_commit(cpu_buffer);
@@ -3207,7 +3864,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3221,7 +3878,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
memcpy(body, data, length);
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -3247,10 +3904,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
if (unlikely(!head))
return true;
- return reader->read == rb_page_commit(reader) &&
- (commit == reader ||
- (commit == head &&
- head->read == rb_page_commit(commit)));
+ /* Reader should exhaust content in reader page */
+ if (reader->read != rb_page_commit(reader))
+ return false;
+
+ /*
+ * If writers are committing on the reader page, knowing all
+ * committed content has been read, the ring buffer is empty.
+ */
+ if (commit == reader)
+ return true;
+
+ /*
+ * If writers are committing on a page other than reader page
+ * and head page, there should always be content to read.
+ */
+ if (commit != head)
+ return false;
+
+ /*
+ * Writers are committing on the head page, we just need
+ * to care about there're committed data, and the reader will
+ * swap reader page with head page when it is to read data.
+ */
+ return rb_page_commit(commit) == 0;
}
/**
@@ -3297,10 +3974,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd | RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
@@ -3320,10 +3997,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd & ~RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
@@ -3443,7 +4120,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
/**
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
@@ -3621,6 +4298,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
+ iter->cache_pages_removed = cpu_buffer->pages_removed;
if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
@@ -3713,12 +4391,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
return;
@@ -3729,7 +4408,6 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
default:
RB_WARN_ON(cpu_buffer, 1);
}
- return;
}
static void
@@ -3743,12 +4421,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
return;
@@ -3759,17 +4438,17 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
default:
RB_WARN_ON(iter->cpu_buffer, 1);
}
- return;
}
static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
- int ret;
+ bool ret;
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -3832,7 +4511,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->pages = reader->list.prev;
/* The reader page will be pointing to the new head */
- rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+ rb_set_list_to_head(&cpu_buffer->reader_page->list);
/*
* We want to make sure we read the overruns after we set up our
@@ -3871,7 +4550,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_inc_page(&cpu_buffer->head_page);
local_inc(&cpu_buffer->pages_read);
@@ -3894,6 +4573,38 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ /*
+ * The writer has preempt disable, wait for it. But not forever
+ * Although, 1 second is pretty much "forever"
+ */
+#define USECS_WAIT 1000000
+ for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+ /* If the write is past the end of page, a writer is still updating it */
+ if (likely(!reader || rb_page_write(reader) <= bsize))
+ break;
+
+ udelay(1);
+
+ /* Get the latest version of the reader write value */
+ smp_rmb();
+ }
+
+ /* The writer is not moving forward? Something is wrong */
+ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+ reader = NULL;
+
+ /*
+ * Make sure we see any padding after the write update
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
+ */
+ smp_rmb();
+
+
return reader;
}
@@ -3918,6 +4629,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
length = rb_event_length(event);
cpu_buffer->reader_page->read += length;
+ cpu_buffer->read_bytes += length;
}
static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -4001,7 +4713,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4042,12 +4755,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
/*
- * Check if someone performed a consuming read to
- * the buffer. A consuming read invalidates the iterator
- * and we need to reset the iterator in this case.
+ * Check if someone performed a consuming read to the buffer
+ * or removed some pages from the buffer. In these cases,
+ * iterator was invalidated and we need to reset it.
*/
if (unlikely(iter->cache_read != cpu_buffer->read ||
- iter->cache_reader_page != cpu_buffer->reader_page))
+ iter->cache_reader_page != cpu_buffer->reader_page ||
+ iter->cache_pages_removed != cpu_buffer->pages_removed))
rb_iter_reset(iter);
again:
@@ -4092,7 +4806,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4145,7 +4860,6 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
{
if (likely(locked))
raw_spin_unlock(&cpu_buffer->reader_lock);
- return;
}
/**
@@ -4309,7 +5023,9 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!iter)
return NULL;
- iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+ /* Holds the entire event: data and meta data */
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -4425,41 +5141,56 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
+static void rb_clear_buffer_page(struct buffer_page *page)
+{
+ local_set(&page->write, 0);
+ local_set(&page->entries, 0);
+ rb_init_page(page->page);
+ page->read = 0;
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct buffer_page *page;
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
- local_set(&cpu_buffer->head_page->write, 0);
- local_set(&cpu_buffer->head_page->entries, 0);
- local_set(&cpu_buffer->head_page->page->commit, 0);
-
- cpu_buffer->head_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->head_page);
+ list_for_each_entry(page, cpu_buffer->pages, list) {
+ rb_clear_buffer_page(page);
+ }
cpu_buffer->tail_page = cpu_buffer->head_page;
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- local_set(&cpu_buffer->reader_page->write, 0);
- local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
- cpu_buffer->reader_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->reader_page);
local_set(&cpu_buffer->entries_bytes, 0);
local_set(&cpu_buffer->overrun, 0);
@@ -4469,19 +5200,43 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
local_set(&cpu_buffer->pages_touched, 0);
+ local_set(&cpu_buffer->pages_lost, 0);
local_set(&cpu_buffer->pages_read, 0);
cpu_buffer->last_pages_touch = 0;
cpu_buffer->shortest_full = 0;
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
- cpu_buffer->write_stamp = 0;
- cpu_buffer->read_stamp = 0;
+ rb_time_set(&cpu_buffer->write_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp));
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
rb_head_page_activate(cpu_buffer);
+ cpu_buffer->pages_removed = 0;
+}
+
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -4492,35 +5247,71 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
atomic_inc(&cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
/* Make sure all commits have finished */
synchronize_rcu();
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reset_disabled_cpu_buffer(cpu_buffer);
- if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
- arch_spin_lock(&cpu_buffer->lock);
+ mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
- rb_reset_cpu(cpu_buffer);
+/* Flag to ensure proper resetting of atomic variables */
+#define RESET_BIT (1 << 30)
- arch_spin_unlock(&cpu_buffer->lock);
+/**
+ * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ */
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
- atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ /*
+ * If a CPU came online during the synchronize_rcu(), then
+ * ignore it.
+ */
+ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+ continue;
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
+ }
+
+ mutex_unlock(&buffer->mutex);
}
-EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset - reset a ring buffer
@@ -4528,15 +5319,37 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
*/
void ring_buffer_reset(struct trace_buffer *buffer)
{
+ struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
- for_each_buffer_cpu(buffer, cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+ mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
- * rind_buffer_empty - is the ring buffer empty?
+ * ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
@@ -4544,8 +5357,8 @@ bool ring_buffer_empty(struct trace_buffer *buffer)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
+ bool ret;
int cpu;
- int ret;
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
@@ -4574,7 +5387,7 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
- int ret;
+ bool ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return true;
@@ -4620,6 +5433,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
@@ -4649,6 +5465,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (local_read(&cpu_buffer_b->committing))
goto out_dec;
+ /*
+ * When resize is in progress, we cannot swap it because
+ * it will mess the state of the cpu buffer.
+ */
+ if (atomic_read(&buffer_a->resizing))
+ goto out_dec;
+ if (atomic_read(&buffer_b->resizing))
+ goto out_dec;
+
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
@@ -4682,40 +5507,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -4725,19 +5558,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- struct buffer_data_page *bpage = data;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -4752,7 +5595,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -4773,9 +5617,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -4790,7 +5635,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -4815,10 +5661,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -4850,7 +5698,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
- if (full)
+ /*
+ * If a full page is expected, this can still be returned
+ * if there's been a previous partial read and the
+ * rest of the page can be read and the commit page is off
+ * the reader page.
+ */
+ if (full &&
+ (!read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
@@ -4899,16 +5755,16 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
- cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+ cpu_buffer->read_bytes += rb_page_commit(reader);
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -4930,7 +5786,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -4942,8 +5798,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -4953,6 +5809,213 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 3 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ /* Size of a subbuf cannot be greater than the write counter */
+ if (psize > RB_WRITE_MASK + 1)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped an the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -5111,7 +6174,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
}
out:
- ring_buffer_unlock_commit(data->buffer, event);
+ ring_buffer_unlock_commit(data->buffer);
return 0;
}
@@ -5179,16 +6242,13 @@ static __init int test_ringbuffer(void)
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
- rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
- "rbtester/%d", cpu);
+ rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu],
+ cpu, "rbtester/%u");
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
-
- kthread_bind(rb_threads[cpu], cpu);
- wake_up_process(rb_threads[cpu]);
}
/* Now create the rb hammer! */
@@ -5295,10 +6355,10 @@ static __init int test_ringbuffer(void)
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
- if (total_lost)
+ if (total_lost) {
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
- if (!total_lost) {
+ } else {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8df0aa810950..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
static int producer_nice = MAX_NICE;
static int consumer_nice = MAX_NICE;
-static int producer_fifo = -1;
-static int consumer_fifo = -1;
+static int producer_fifo;
+static int consumer_fifo;
module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
@@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
module_param(producer_fifo, int, 0644);
-MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo");
module_param(consumer_fifo, int, 0644);
-MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo");
static int read_events;
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
@@ -258,7 +260,7 @@ static void ring_buffer_producer(void)
hit++;
entry = ring_buffer_event_data(event);
*entry = smp_processor_id();
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
}
end_time = ktime_get();
@@ -303,22 +305,22 @@ static void ring_buffer_producer(void)
trace_printk("ERROR!\n");
if (!disable_reader) {
- if (consumer_fifo < 0)
+ if (consumer_fifo)
+ trace_printk("Running Consumer at SCHED_FIFO %s\n",
+ consumer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
- else
- trace_printk("Running Consumer at SCHED_FIFO %d\n",
- consumer_fifo);
}
- if (producer_fifo < 0)
+ if (producer_fifo)
+ trace_printk("Running Producer at SCHED_FIFO %s\n",
+ producer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
- else
- trace_printk("Running Producer at SCHED_FIFO %d\n",
- producer_fifo);
/* Let the user know that the test is running at low priority */
- if (producer_fifo < 0 && consumer_fifo < 0 &&
+ if (!producer_fifo && !consumer_fifo &&
producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
@@ -455,21 +457,19 @@ static int __init ring_buffer_benchmark_init(void)
* Run them as low-prio background tasks by default:
*/
if (!disable_reader) {
- if (consumer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = consumer_fifo
- };
- sched_setscheduler(consumer, SCHED_FIFO, &param);
- } else
+ if (consumer_fifo >= 2)
+ sched_set_fifo(consumer);
+ else if (consumer_fifo == 1)
+ sched_set_fifo_low(consumer);
+ else
set_user_nice(consumer, consumer_nice);
}
- if (producer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = producer_fifo
- };
- sched_setscheduler(producer, SCHED_FIFO, &param);
- } else
+ if (producer_fifo >= 2)
+ sched_set_fifo(producer);
+ else if (producer_fifo == 1)
+ sched_set_fifo_low(producer);
+ else
set_user_nice(producer, producer_nice);
return 0;
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
new file mode 100644
index 000000000000..831779607e84
--- /dev/null
+++ b/kernel/trace/rv/Kconfig
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_ID
+ select DA_MON_EVENTS
+ bool
+
+menuconfig RV
+ bool "Runtime Verification"
+ depends on TRACING
+ help
+ Enable the kernel runtime verification infrastructure. RV is a
+ lightweight (yet rigorous) method that complements classical
+ exhaustive verification techniques (such as model checking and
+ theorem proving). RV works by analyzing the trace of the system's
+ actual execution, comparing it against a formal specification of
+ the system behavior.
+
+ For further information, see:
+ Documentation/trace/rv/runtime-verification.rst
+
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
+
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable wwnr (wakeup while not running) sample monitor, this is a
+ sample monitor that illustrates the usage of per-task monitor.
+ The model is borken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
+
+config RV_REACTORS
+ bool "Runtime verification reactors"
+ default y
+ depends on RV
+ help
+ Enables the online runtime verification reactors. A runtime
+ monitor can cause a reaction to the detection of an exception
+ on the model's execution. By default, the monitors have
+ tracing reactions, printing the monitor output via tracepoints,
+ but other reactions can be added (on-demand) via this interface.
+
+config RV_REACT_PRINTK
+ bool "Printk reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the printk reactor. The printk reactor emits a printk()
+ message if an exception is found.
+
+config RV_REACT_PANIC
+ bool "Panic reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the panic reactor. The panic reactor emits a printk()
+ message if an exception is found and panic()s the system.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
new file mode 100644
index 000000000000..963d14875b45
--- /dev/null
+++ b/kernel/trace/rv/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
+obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
+obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
+obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
new file mode 100644
index 000000000000..b2b49a27e886
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "wip"
+
+#include <trace/events/rv.h>
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+
+#include "wip.h"
+
+static struct rv_monitor rv_wip;
+DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_wip(preempt_disable_wip);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_wip(preempt_enable_wip);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ da_handle_event_wip(sched_waking_wip);
+}
+
+static int enable_wip(void)
+{
+ int retval;
+
+ retval = da_monitor_init_wip();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("wip", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable);
+
+ return 0;
+}
+
+static void disable_wip(void)
+{
+ rv_wip.enabled = 0;
+
+ rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
+
+ da_monitor_destroy_wip();
+}
+
+static struct rv_monitor rv_wip = {
+ .name = "wip",
+ .description = "wakeup in preemptive per-cpu testing monitor.",
+ .enable = enable_wip,
+ .disable = disable_wip,
+ .reset = da_monitor_reset_all_wip,
+ .enabled = 0,
+};
+
+static int __init register_wip(void)
+{
+ rv_register_monitor(&rv_wip);
+ return 0;
+}
+
+static void __exit unregister_wip(void)
+{
+ rv_unregister_monitor(&rv_wip);
+}
+
+module_init(register_wip);
+module_exit(unregister_wip);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor.");
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
new file mode 100644
index 000000000000..2e373f2c65ed
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -0,0 +1,46 @@
+/*
+ * Automatically generated C representation of wip automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_wip {
+ preemptive_wip = 0,
+ non_preemptive_wip,
+ state_max_wip
+};
+
+#define INVALID_STATE state_max_wip
+
+enum events_wip {
+ preempt_disable_wip = 0,
+ preempt_enable_wip,
+ sched_waking_wip,
+ event_max_wip
+};
+
+struct automaton_wip {
+ char *state_names[state_max_wip];
+ char *event_names[event_max_wip];
+ unsigned char function[state_max_wip][event_max_wip];
+ unsigned char initial_state;
+ bool final_states[state_max_wip];
+};
+
+static const struct automaton_wip automaton_wip = {
+ .state_names = {
+ "preemptive",
+ "non_preemptive"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "sched_waking"
+ },
+ .function = {
+ { non_preemptive_wip, INVALID_STATE, INVALID_STATE },
+ { INVALID_STATE, preemptive_wip, non_preemptive_wip },
+ },
+ .initial_state = preemptive_wip,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
new file mode 100644
index 000000000000..0e43dd2db685
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "wwnr"
+
+#include <trace/events/rv.h>
+#include <trace/events/sched.h>
+
+#include "wwnr.h"
+
+static struct rv_monitor rv_wwnr;
+DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+
+static void handle_switch(void *data, bool preempt, struct task_struct *p,
+ struct task_struct *n, unsigned int prev_state)
+{
+ /* start monitoring only after the first suspension */
+ if (prev_state == TASK_INTERRUPTIBLE)
+ da_handle_start_event_wwnr(p, switch_out_wwnr);
+ else
+ da_handle_event_wwnr(p, switch_out_wwnr);
+
+ da_handle_event_wwnr(n, switch_in_wwnr);
+}
+
+static void handle_wakeup(void *data, struct task_struct *p)
+{
+ da_handle_event_wwnr(p, wakeup_wwnr);
+}
+
+static int enable_wwnr(void)
+{
+ int retval;
+
+ retval = da_monitor_init_wwnr();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("wwnr", sched_switch, handle_switch);
+ rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+ return 0;
+}
+
+static void disable_wwnr(void)
+{
+ rv_wwnr.enabled = 0;
+
+ rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
+ rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+ da_monitor_destroy_wwnr();
+}
+
+static struct rv_monitor rv_wwnr = {
+ .name = "wwnr",
+ .description = "wakeup while not running per-task testing model.",
+ .enable = enable_wwnr,
+ .disable = disable_wwnr,
+ .reset = da_monitor_reset_all_wwnr,
+ .enabled = 0,
+};
+
+static int __init register_wwnr(void)
+{
+ rv_register_monitor(&rv_wwnr);
+ return 0;
+}
+
+static void __exit unregister_wwnr(void)
+{
+ rv_unregister_monitor(&rv_wwnr);
+}
+
+module_init(register_wwnr);
+module_exit(unregister_wwnr);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wwnr: wakeup while not running monitor");
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
new file mode 100644
index 000000000000..d0d9c4b8121b
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -0,0 +1,46 @@
+/*
+ * Automatically generated C representation of wwnr automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_wwnr {
+ not_running_wwnr = 0,
+ running_wwnr,
+ state_max_wwnr
+};
+
+#define INVALID_STATE state_max_wwnr
+
+enum events_wwnr {
+ switch_in_wwnr = 0,
+ switch_out_wwnr,
+ wakeup_wwnr,
+ event_max_wwnr
+};
+
+struct automaton_wwnr {
+ char *state_names[state_max_wwnr];
+ char *event_names[event_max_wwnr];
+ unsigned char function[state_max_wwnr][event_max_wwnr];
+ unsigned char initial_state;
+ bool final_states[state_max_wwnr];
+};
+
+static const struct automaton_wwnr automaton_wwnr = {
+ .state_names = {
+ "not_running",
+ "running"
+ },
+ .event_names = {
+ "switch_in",
+ "switch_out",
+ "wakeup"
+ },
+ .function = {
+ { running_wwnr, INVALID_STATE, not_running_wwnr },
+ { INVALID_STATE, not_running_wwnr, INVALID_STATE },
+ },
+ .initial_state = not_running_wwnr,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
new file mode 100644
index 000000000000..0186ff4cbd0b
--- /dev/null
+++ b/kernel/trace/rv/reactor_panic.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Panic RV reactor:
+ * Prints the exception msg to the kernel message log and panic().
+ */
+
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+static void rv_panic_reaction(char *msg)
+{
+ panic(msg);
+}
+
+static struct rv_reactor rv_panic = {
+ .name = "panic",
+ .description = "panic the system if an exception is found.",
+ .react = rv_panic_reaction
+};
+
+static int __init register_react_panic(void)
+{
+ rv_register_reactor(&rv_panic);
+ return 0;
+}
+
+static void __exit unregister_react_panic(void)
+{
+ rv_unregister_reactor(&rv_panic);
+}
+
+module_init(register_react_panic);
+module_exit(unregister_react_panic);
+
+MODULE_AUTHOR("Daniel Bristot de Oliveira");
+MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found.");
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
new file mode 100644
index 000000000000..178759dbf89f
--- /dev/null
+++ b/kernel/trace/rv/reactor_printk.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Printk RV reactor:
+ * Prints the exception msg to the kernel message log.
+ */
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+static void rv_printk_reaction(char *msg)
+{
+ printk_deferred(msg);
+}
+
+static struct rv_reactor rv_printk = {
+ .name = "printk",
+ .description = "prints the exception msg to the kernel message log.",
+ .react = rv_printk_reaction
+};
+
+static int __init register_react_printk(void)
+{
+ rv_register_reactor(&rv_printk);
+ return 0;
+}
+
+static void __exit unregister_react_printk(void)
+{
+ rv_unregister_reactor(&rv_printk);
+}
+
+module_init(register_react_printk);
+module_exit(unregister_react_printk);
+
+MODULE_AUTHOR("Daniel Bristot de Oliveira");
+MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit.");
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
new file mode 100644
index 000000000000..2f68e93fff0b
--- /dev/null
+++ b/kernel/trace/rv/rv.c
@@ -0,0 +1,797 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * This is the online Runtime Verification (RV) interface.
+ *
+ * RV is a lightweight (yet rigorous) method that complements classical
+ * exhaustive verification techniques (such as model checking and
+ * theorem proving) with a more practical approach to complex systems.
+ *
+ * RV works by analyzing the trace of the system's actual execution,
+ * comparing it against a formal specification of the system behavior.
+ * RV can give precise information on the runtime behavior of the
+ * monitored system while enabling the reaction for unexpected
+ * events, avoiding, for example, the propagation of a failure on
+ * safety-critical systems.
+ *
+ * The development of this interface roots in the development of the
+ * paper:
+ *
+ * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
+ * Silva. Efficient formal verification for the Linux kernel. In:
+ * International Conference on Software Engineering and Formal Methods.
+ * Springer, Cham, 2019. p. 315-332.
+ *
+ * And:
+ *
+ * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis
+ * and verification of the real-time Linux kernel. PhD Thesis, 2020.
+ *
+ * == Runtime monitor interface ==
+ *
+ * A monitor is the central part of the runtime verification of a system.
+ *
+ * The monitor stands in between the formal specification of the desired
+ * (or undesired) behavior, and the trace of the actual system.
+ *
+ * In Linux terms, the runtime verification monitors are encapsulated
+ * inside the "RV monitor" abstraction. A RV monitor includes a reference
+ * model of the system, a set of instances of the monitor (per-cpu monitor,
+ * per-task monitor, and so on), and the helper functions that glue the
+ * monitor to the system via trace. Generally, a monitor includes some form
+ * of trace output as a reaction for event parsing and exceptions,
+ * as depicted bellow:
+ *
+ * Linux +----- RV Monitor ----------------------------------+ Formal
+ * Realm | | Realm
+ * +-------------------+ +----------------+ +-----------------+
+ * | Linux kernel | | Monitor | | Reference |
+ * | Tracing | -> | Instance(s) | <- | Model |
+ * | (instrumentation) | | (verification) | | (specification) |
+ * +-------------------+ +----------------+ +-----------------+
+ * | | |
+ * | V |
+ * | +----------+ |
+ * | | Reaction | |
+ * | +--+--+--+-+ |
+ * | | | | |
+ * | | | +-> trace output ? |
+ * +------------------------|--|----------------------+
+ * | +----> panic ?
+ * +-------> <user-specified>
+ *
+ * This file implements the interface for loading RV monitors, and
+ * to control the verification session.
+ *
+ * == Registering monitors ==
+ *
+ * The struct rv_monitor defines a set of callback functions to control
+ * a verification session. For instance, when a given monitor is enabled,
+ * the "enable" callback function is called to hook the instrumentation
+ * functions to the kernel trace events. The "disable" function is called
+ * when disabling the verification session.
+ *
+ * A RV monitor is registered via:
+ * int rv_register_monitor(struct rv_monitor *monitor);
+ * And unregistered via:
+ * int rv_unregister_monitor(struct rv_monitor *monitor);
+ *
+ * == User interface ==
+ *
+ * The user interface resembles kernel tracing interface. It presents
+ * these files:
+ *
+ * "available_monitors"
+ * - List the available monitors, one per line.
+ *
+ * For example:
+ * # cat available_monitors
+ * wip
+ * wwnr
+ *
+ * "enabled_monitors"
+ * - Lists the enabled monitors, one per line;
+ * - Writing to it enables a given monitor;
+ * - Writing a monitor name with a '!' prefix disables it;
+ * - Truncating the file disables all enabled monitors.
+ *
+ * For example:
+ * # cat enabled_monitors
+ * # echo wip > enabled_monitors
+ * # echo wwnr >> enabled_monitors
+ * # cat enabled_monitors
+ * wip
+ * wwnr
+ * # echo '!wip' >> enabled_monitors
+ * # cat enabled_monitors
+ * wwnr
+ * # echo > enabled_monitors
+ * # cat enabled_monitors
+ * #
+ *
+ * Note that more than one monitor can be enabled concurrently.
+ *
+ * "monitoring_on"
+ * - It is an on/off general switcher for monitoring. Note
+ * that it does not disable enabled monitors or detach events,
+ * but stops the per-entity monitors from monitoring the events
+ * received from the instrumentation. It resembles the "tracing_on"
+ * switcher.
+ *
+ * "monitors/"
+ * Each monitor will have its own directory inside "monitors/". There
+ * the monitor specific files will be presented.
+ * The "monitors/" directory resembles the "events" directory on
+ * tracefs.
+ *
+ * For example:
+ * # cd monitors/wip/
+ * # ls
+ * desc enable
+ * # cat desc
+ * auto-generated wakeup in preemptive monitor.
+ * # cat enable
+ * 0
+ *
+ * For further information, see:
+ * Documentation/trace/rv/runtime-verification.rst
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_DA_MON_EVENTS
+#define CREATE_TRACE_POINTS
+#include <trace/events/rv.h>
+#endif
+
+#include "rv.h"
+
+DEFINE_MUTEX(rv_interface_lock);
+
+static struct rv_interface rv_root;
+
+struct dentry *get_monitors_root(void)
+{
+ return rv_root.monitors_dir;
+}
+
+/*
+ * Interface for the monitor register.
+ */
+static LIST_HEAD(rv_monitors_list);
+
+static int task_monitor_count;
+static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+
+int rv_get_task_monitor_slot(void)
+{
+ int i;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (task_monitor_count == RV_PER_TASK_MONITORS)
+ return -EBUSY;
+
+ task_monitor_count++;
+
+ for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+ if (task_monitor_slots[i] == false) {
+ task_monitor_slots[i] = true;
+ return i;
+ }
+ }
+
+ WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n");
+
+ return -EINVAL;
+}
+
+void rv_put_task_monitor_slot(int slot)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+ WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
+ return;
+ }
+
+ WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n",
+ slot);
+
+ task_monitor_count--;
+ task_monitor_slots[slot] = false;
+}
+
+/*
+ * This section collects the monitor/ files and folders.
+ */
+static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ const char *buff;
+
+ buff = mdef->monitor->enabled ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+/*
+ * __rv_disable_monitor - disabled an enabled monitor
+ */
+static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled) {
+ mdef->monitor->enabled = 0;
+ mdef->monitor->disable();
+
+ /*
+ * Wait for the execution of all events to finish.
+ * Otherwise, the data used by the monitor could
+ * be inconsistent. i.e., if the monitor is re-enabled.
+ */
+ if (sync)
+ tracepoint_synchronize_unregister();
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rv_disable_monitor - disable a given runtime monitor
+ *
+ * Returns 0 on success.
+ */
+int rv_disable_monitor(struct rv_monitor_def *mdef)
+{
+ __rv_disable_monitor(mdef, true);
+ return 0;
+}
+
+/**
+ * rv_enable_monitor - enable a given runtime monitor
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int rv_enable_monitor(struct rv_monitor_def *mdef)
+{
+ int retval;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled)
+ return 0;
+
+ retval = mdef->monitor->enable();
+
+ if (!retval)
+ mdef->monitor->enabled = 1;
+
+ return retval;
+}
+
+/*
+ * interface for enabling/disabling a monitor.
+ */
+static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ retval = rv_enable_monitor(mdef);
+ else
+ retval = rv_disable_monitor(mdef);
+
+ mutex_unlock(&rv_interface_lock);
+
+ return retval ? : count;
+}
+
+static const struct file_operations interface_enable_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = monitor_enable_write_data,
+ .read = monitor_enable_read_data,
+};
+
+/*
+ * Interface to read monitors description.
+ */
+static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ char buff[256];
+
+ memset(buff, 0, sizeof(buff));
+
+ snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static const struct file_operations interface_desc_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .read = monitor_desc_read_data,
+};
+
+/*
+ * During the registration of a monitor, this function creates
+ * the monitor dir, where the specific options of the monitor
+ * are exposed.
+ */
+static int create_monitor_dir(struct rv_monitor_def *mdef)
+{
+ struct dentry *root = get_monitors_root();
+ const char *name = mdef->monitor->name;
+ struct dentry *tmp;
+ int retval;
+
+ mdef->root_d = rv_create_dir(name, root);
+ if (!mdef->root_d)
+ return -ENOMEM;
+
+ tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+ if (!tmp) {
+ retval = -ENOMEM;
+ goto out_remove_root;
+ }
+
+ tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+ if (!tmp) {
+ retval = -ENOMEM;
+ goto out_remove_root;
+ }
+
+ retval = reactor_populate_monitor(mdef);
+ if (retval)
+ goto out_remove_root;
+
+ return 0;
+
+out_remove_root:
+ rv_remove(mdef->root_d);
+ return retval;
+}
+
+/*
+ * Available/Enable monitor shared seq functions.
+ */
+static int monitors_show(struct seq_file *m, void *p)
+{
+ struct rv_monitor_def *mon_def = p;
+
+ seq_printf(m, "%s\n", mon_def->monitor->name);
+ return 0;
+}
+
+/*
+ * Used by the seq file operations at the end of a read
+ * operation.
+ */
+static void monitors_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&rv_interface_lock);
+}
+
+/*
+ * Available monitor seq functions.
+ */
+static void *available_monitors_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&rv_interface_lock);
+ return seq_list_start(&rv_monitors_list, *pos);
+}
+
+static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &rv_monitors_list, pos);
+}
+
+/*
+ * Enable monitor seq functions.
+ */
+static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct rv_monitor_def *m_def = p;
+
+ (*pos)++;
+
+ list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
+ if (m_def->monitor->enabled)
+ return m_def;
+ }
+
+ return NULL;
+}
+
+static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
+{
+ struct rv_monitor_def *m_def;
+ loff_t l;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (list_empty(&rv_monitors_list))
+ return NULL;
+
+ m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+
+ for (l = 0; l <= *pos; ) {
+ m_def = enabled_monitors_next(m, m_def, &l);
+ if (!m_def)
+ break;
+ }
+
+ return m_def;
+}
+
+/*
+ * available/enabled monitors seq definition.
+ */
+static const struct seq_operations available_monitors_seq_ops = {
+ .start = available_monitors_start,
+ .next = available_monitors_next,
+ .stop = monitors_stop,
+ .show = monitors_show
+};
+
+static const struct seq_operations enabled_monitors_seq_ops = {
+ .start = enabled_monitors_start,
+ .next = enabled_monitors_next,
+ .stop = monitors_stop,
+ .show = monitors_show
+};
+
+/*
+ * available_monitors interface.
+ */
+static int available_monitors_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &available_monitors_seq_ops);
+};
+
+static const struct file_operations available_monitors_ops = {
+ .open = available_monitors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * enabled_monitors interface.
+ */
+static void disable_all_monitors(void)
+{
+ struct rv_monitor_def *mdef;
+ int enabled = 0;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry(mdef, &rv_monitors_list, list)
+ enabled += __rv_disable_monitor(mdef, false);
+
+ if (enabled) {
+ /*
+ * Wait for the execution of all events to finish.
+ * Otherwise, the data used by the monitor could
+ * be inconsistent. i.e., if the monitor is re-enabled.
+ */
+ tracepoint_synchronize_unregister();
+ }
+
+ mutex_unlock(&rv_interface_lock);
+}
+
+static int enabled_monitors_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+ disable_all_monitors();
+
+ return seq_open(file, &enabled_monitors_seq_ops);
+};
+
+static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
+ struct rv_monitor_def *mdef;
+ int retval = -EINVAL;
+ bool enable = true;
+ char *ptr;
+ int len;
+
+ if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
+ return -EINVAL;
+
+ memset(buff, 0, sizeof(buff));
+
+ retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+ if (retval < 0)
+ return -EFAULT;
+
+ ptr = strim(buff);
+
+ if (ptr[0] == '!') {
+ enable = false;
+ ptr++;
+ }
+
+ len = strlen(ptr);
+ if (!len)
+ return count;
+
+ mutex_lock(&rv_interface_lock);
+
+ retval = -EINVAL;
+
+ list_for_each_entry(mdef, &rv_monitors_list, list) {
+ if (strcmp(ptr, mdef->monitor->name) != 0)
+ continue;
+
+ /*
+ * Monitor found!
+ */
+ if (enable)
+ retval = rv_enable_monitor(mdef);
+ else
+ retval = rv_disable_monitor(mdef);
+
+ if (!retval)
+ retval = count;
+
+ break;
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+static const struct file_operations enabled_monitors_ops = {
+ .open = enabled_monitors_open,
+ .read = seq_read,
+ .write = enabled_monitors_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Monitoring on global switcher!
+ */
+static bool __read_mostly monitoring_on;
+
+/**
+ * rv_monitoring_on - checks if monitoring is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_monitoring_on(void)
+{
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_rmb();
+ return READ_ONCE(monitoring_on);
+}
+
+/*
+ * monitoring_on general switcher.
+ */
+static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ const char *buff;
+
+ buff = rv_monitoring_on() ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static void turn_monitoring_off(void)
+{
+ WRITE_ONCE(monitoring_on, false);
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_wmb();
+}
+
+static void reset_all_monitors(void)
+{
+ struct rv_monitor_def *mdef;
+
+ list_for_each_entry(mdef, &rv_monitors_list, list) {
+ if (mdef->monitor->enabled)
+ mdef->monitor->reset();
+ }
+}
+
+static void turn_monitoring_on(void)
+{
+ WRITE_ONCE(monitoring_on, true);
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_wmb();
+}
+
+static void turn_monitoring_on_with_reset(void)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (rv_monitoring_on())
+ return;
+
+ /*
+ * Monitors might be out of sync with the system if events were not
+ * processed because of !rv_monitoring_on().
+ *
+ * Reset all monitors, forcing a re-sync.
+ */
+ reset_all_monitors();
+ turn_monitoring_on();
+}
+
+static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ turn_monitoring_on_with_reset();
+ else
+ turn_monitoring_off();
+
+ /*
+ * Wait for the execution of all events to finish
+ * before returning to user-space.
+ */
+ tracepoint_synchronize_unregister();
+
+ mutex_unlock(&rv_interface_lock);
+
+ return count;
+}
+
+static const struct file_operations monitoring_on_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = monitoring_on_write_data,
+ .read = monitoring_on_read_data,
+};
+
+static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+{
+ reactor_cleanup_monitor(mdef);
+ rv_remove(mdef->root_d);
+}
+
+/**
+ * rv_register_monitor - register a rv monitor.
+ * @monitor: The rv_monitor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_monitor(struct rv_monitor *monitor)
+{
+ struct rv_monitor_def *r;
+ int retval = 0;
+
+ if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
+ pr_info("Monitor %s has a name longer than %d\n", monitor->name,
+ MAX_RV_MONITOR_NAME_SIZE);
+ return -1;
+ }
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry(r, &rv_monitors_list, list) {
+ if (strcmp(monitor->name, r->monitor->name) == 0) {
+ pr_info("Monitor %s is already registered\n", monitor->name);
+ retval = -1;
+ goto out_unlock;
+ }
+ }
+
+ r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
+ if (!r) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ r->monitor = monitor;
+
+ retval = create_monitor_dir(r);
+ if (retval) {
+ kfree(r);
+ goto out_unlock;
+ }
+
+ list_add_tail(&r->list, &rv_monitors_list);
+
+out_unlock:
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+/**
+ * rv_unregister_monitor - unregister a rv monitor.
+ * @monitor: The rv_monitor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_monitor(struct rv_monitor *monitor)
+{
+ struct rv_monitor_def *ptr, *next;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
+ if (strcmp(monitor->name, ptr->monitor->name) == 0) {
+ rv_disable_monitor(ptr);
+ list_del(&ptr->list);
+ destroy_monitor_dir(ptr);
+ }
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return 0;
+}
+
+int __init rv_init_interface(void)
+{
+ struct dentry *tmp;
+ int retval;
+
+ rv_root.root_dir = rv_create_dir("rv", NULL);
+ if (!rv_root.root_dir)
+ goto out_err;
+
+ rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+ if (!rv_root.monitors_dir)
+ goto out_err;
+
+ tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+ &available_monitors_ops);
+ if (!tmp)
+ goto out_err;
+
+ tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ &enabled_monitors_ops);
+ if (!tmp)
+ goto out_err;
+
+ tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ &monitoring_on_fops);
+ if (!tmp)
+ goto out_err;
+ retval = init_rv_reactors(rv_root.root_dir);
+ if (retval)
+ goto out_err;
+
+ turn_monitoring_on();
+
+ return 0;
+
+out_err:
+ rv_remove(rv_root.root_dir);
+ printk(KERN_ERR "RV: Error while creating the RV interface\n");
+ return 1;
+}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
new file mode 100644
index 000000000000..db6cb0913dbd
--- /dev/null
+++ b/kernel/trace/rv/rv.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/mutex.h>
+
+struct rv_interface {
+ struct dentry *root_dir;
+ struct dentry *monitors_dir;
+};
+
+#include "../trace.h"
+#include <linux/tracefs.h>
+#include <linux/rv.h>
+
+#define RV_MODE_WRITE TRACE_MODE_WRITE
+#define RV_MODE_READ TRACE_MODE_READ
+
+#define rv_create_dir tracefs_create_dir
+#define rv_create_file tracefs_create_file
+#define rv_remove tracefs_remove
+
+#define MAX_RV_MONITOR_NAME_SIZE 32
+#define MAX_RV_REACTOR_NAME_SIZE 32
+
+extern struct mutex rv_interface_lock;
+
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor_def {
+ struct list_head list;
+ struct rv_reactor *reactor;
+ /* protected by the monitor interface lock */
+ int counter;
+};
+#endif
+
+struct rv_monitor_def {
+ struct list_head list;
+ struct rv_monitor *monitor;
+ struct dentry *root_d;
+#ifdef CONFIG_RV_REACTORS
+ struct rv_reactor_def *rdef;
+ bool reacting;
+#endif
+ bool task_monitor;
+};
+
+struct dentry *get_monitors_root(void);
+int rv_disable_monitor(struct rv_monitor_def *mdef);
+int rv_enable_monitor(struct rv_monitor_def *mdef);
+
+#ifdef CONFIG_RV_REACTORS
+int reactor_populate_monitor(struct rv_monitor_def *mdef);
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int init_rv_reactors(struct dentry *root_dir);
+#else
+static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+ return 0;
+}
+
+static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+ return;
+}
+
+static inline int init_rv_reactors(struct dentry *root_dir)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
new file mode 100644
index 000000000000..6aae106695b6
--- /dev/null
+++ b/kernel/trace/rv/rv_reactors.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Runtime reactor interface.
+ *
+ * A runtime monitor can cause a reaction to the detection of an
+ * exception on the model's execution. By default, the monitors have
+ * tracing reactions, printing the monitor output via tracepoints.
+ * But other reactions can be added (on-demand) via this interface.
+ *
+ * == Registering reactors ==
+ *
+ * The struct rv_reactor defines a callback function to be executed
+ * in case of a model exception happens. The callback function
+ * receives a message to be (optionally) printed before executing
+ * the reaction.
+ *
+ * A RV reactor is registered via:
+ * int rv_register_reactor(struct rv_reactor *reactor)
+ * And unregistered via:
+ * int rv_unregister_reactor(struct rv_reactor *reactor)
+ *
+ * These functions are exported to modules, enabling reactors to be
+ * dynamically loaded.
+ *
+ * == User interface ==
+ *
+ * The user interface resembles the kernel tracing interface and
+ * presents these files:
+ *
+ * "available_reactors"
+ * - List the available reactors, one per line.
+ *
+ * For example:
+ * # cat available_reactors
+ * nop
+ * panic
+ * printk
+ *
+ * "reacting_on"
+ * - It is an on/off general switch for reactors, disabling
+ * all reactions.
+ *
+ * "monitors/MONITOR/reactors"
+ * - List available reactors, with the select reaction for the given
+ * MONITOR inside []. The default one is the nop (no operation)
+ * reactor.
+ * - Writing the name of an reactor enables it to the given
+ * MONITOR.
+ *
+ * For example:
+ * # cat monitors/wip/reactors
+ * [nop]
+ * panic
+ * printk
+ * # echo panic > monitors/wip/reactors
+ * # cat monitors/wip/reactors
+ * nop
+ * [panic]
+ * printk
+ */
+
+#include <linux/slab.h>
+
+#include "rv.h"
+
+/*
+ * Interface for the reactor register.
+ */
+static LIST_HEAD(rv_reactors_list);
+
+static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+{
+ struct rv_reactor_def *r;
+
+ list_for_each_entry(r, &rv_reactors_list, list) {
+ if (strcmp(name, r->reactor->name) == 0)
+ return r;
+ }
+ return NULL;
+}
+
+/*
+ * Available reactors seq functions.
+ */
+static int reactors_show(struct seq_file *m, void *p)
+{
+ struct rv_reactor_def *rea_def = p;
+
+ seq_printf(m, "%s\n", rea_def->reactor->name);
+ return 0;
+}
+
+static void reactors_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&rv_interface_lock);
+}
+
+static void *reactors_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&rv_interface_lock);
+ return seq_list_start(&rv_reactors_list, *pos);
+}
+
+static void *reactors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &rv_reactors_list, pos);
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations available_reactors_seq_ops = {
+ .start = reactors_start,
+ .next = reactors_next,
+ .stop = reactors_stop,
+ .show = reactors_show
+};
+
+/*
+ * available_reactors interface.
+ */
+static int available_reactors_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &available_reactors_seq_ops);
+};
+
+static const struct file_operations available_reactors_ops = {
+ .open = available_reactors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * Monitor's reactor file.
+ */
+static int monitor_reactor_show(struct seq_file *m, void *p)
+{
+ struct rv_monitor_def *mdef = m->private;
+ struct rv_reactor_def *rdef = p;
+
+ if (mdef->rdef == rdef)
+ seq_printf(m, "[%s]\n", rdef->reactor->name);
+ else
+ seq_printf(m, "%s\n", rdef->reactor->name);
+ return 0;
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations monitor_reactors_seq_ops = {
+ .start = reactors_start,
+ .next = reactors_next,
+ .stop = reactors_stop,
+ .show = monitor_reactor_show
+};
+
+static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
+ bool reacting)
+{
+ bool monitor_enabled;
+
+ /* nothing to do */
+ if (mdef->rdef == rdef)
+ return;
+
+ monitor_enabled = mdef->monitor->enabled;
+ if (monitor_enabled)
+ rv_disable_monitor(mdef);
+
+ /* swap reactor's usage */
+ mdef->rdef->counter--;
+ rdef->counter++;
+
+ mdef->rdef = rdef;
+ mdef->reacting = reacting;
+ mdef->monitor->react = rdef->reactor->react;
+
+ if (monitor_enabled)
+ rv_enable_monitor(mdef);
+}
+
+static ssize_t
+monitor_reactors_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
+ struct rv_monitor_def *mdef;
+ struct rv_reactor_def *rdef;
+ struct seq_file *seq_f;
+ int retval = -EINVAL;
+ bool enable;
+ char *ptr;
+ int len;
+
+ if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1)
+ return -EINVAL;
+
+ memset(buff, 0, sizeof(buff));
+
+ retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+ if (retval < 0)
+ return -EFAULT;
+
+ ptr = strim(buff);
+
+ len = strlen(ptr);
+ if (!len)
+ return count;
+
+ /*
+ * See monitor_reactors_open()
+ */
+ seq_f = file->private_data;
+ mdef = seq_f->private;
+
+ mutex_lock(&rv_interface_lock);
+
+ retval = -EINVAL;
+
+ list_for_each_entry(rdef, &rv_reactors_list, list) {
+ if (strcmp(ptr, rdef->reactor->name) != 0)
+ continue;
+
+ if (rdef == get_reactor_rdef_by_name("nop"))
+ enable = false;
+ else
+ enable = true;
+
+ monitor_swap_reactors(mdef, rdef, enable);
+
+ retval = count;
+ break;
+ }
+
+ mutex_unlock(&rv_interface_lock);
+
+ return retval;
+}
+
+/*
+ * available_reactors interface.
+ */
+static int monitor_reactors_open(struct inode *inode, struct file *file)
+{
+ struct rv_monitor_def *mdef = inode->i_private;
+ struct seq_file *seq_f;
+ int ret;
+
+ ret = seq_open(file, &monitor_reactors_seq_ops);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * seq_open stores the seq_file on the file->private data.
+ */
+ seq_f = file->private_data;
+
+ /*
+ * Copy the create file "private" data to the seq_file private data.
+ */
+ seq_f->private = mdef;
+
+ return 0;
+};
+
+static const struct file_operations monitor_reactors_ops = {
+ .open = monitor_reactors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = monitor_reactors_write
+};
+
+static int __rv_register_reactor(struct rv_reactor *reactor)
+{
+ struct rv_reactor_def *r;
+
+ list_for_each_entry(r, &rv_reactors_list, list) {
+ if (strcmp(reactor->name, r->reactor->name) == 0) {
+ pr_info("Reactor %s is already registered\n", reactor->name);
+ return -EINVAL;
+ }
+ }
+
+ r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->reactor = reactor;
+ r->counter = 0;
+
+ list_add_tail(&r->list, &rv_reactors_list);
+
+ return 0;
+}
+
+/**
+ * rv_register_reactor - register a rv reactor.
+ * @reactor: The rv_reactor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_reactor(struct rv_reactor *reactor)
+{
+ int retval = 0;
+
+ if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
+ pr_info("Reactor %s has a name longer than %d\n",
+ reactor->name, MAX_RV_MONITOR_NAME_SIZE);
+ return -EINVAL;
+ }
+
+ mutex_lock(&rv_interface_lock);
+ retval = __rv_register_reactor(reactor);
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+/**
+ * rv_unregister_reactor - unregister a rv reactor.
+ * @reactor: The rv_reactor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_reactor(struct rv_reactor *reactor)
+{
+ struct rv_reactor_def *ptr, *next;
+ int ret = 0;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
+ if (strcmp(reactor->name, ptr->reactor->name) == 0) {
+
+ if (!ptr->counter) {
+ list_del(&ptr->list);
+ } else {
+ printk(KERN_WARNING
+ "rv: the rv_reactor %s is in use by %d monitor(s)\n",
+ ptr->reactor->name, ptr->counter);
+ printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
+ ptr->reactor->name);
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return ret;
+}
+
+/*
+ * reacting_on interface.
+ */
+static bool __read_mostly reacting_on;
+
+/**
+ * rv_reacting_on - checks if reacting is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_reacting_on(void)
+{
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_rmb();
+ return READ_ONCE(reacting_on);
+}
+
+static ssize_t reacting_on_read_data(struct file *filp,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char *buff;
+
+ buff = rv_reacting_on() ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+static void turn_reacting_off(void)
+{
+ WRITE_ONCE(reacting_on, false);
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_wmb();
+}
+
+static void turn_reacting_on(void)
+{
+ WRITE_ONCE(reacting_on, true);
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_wmb();
+}
+
+static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ turn_reacting_on();
+ else
+ turn_reacting_off();
+
+ /*
+ * Wait for the execution of all events to finish
+ * before returning to user-space.
+ */
+ tracepoint_synchronize_unregister();
+
+ mutex_unlock(&rv_interface_lock);
+
+ return count;
+}
+
+static const struct file_operations reacting_on_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = reacting_on_write_data,
+ .read = reacting_on_read_data,
+};
+
+/**
+ * reactor_populate_monitor - creates per monitor reactors file
+ * @mdef: monitor's definition.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+ struct dentry *tmp;
+
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+ if (!tmp)
+ return -ENOMEM;
+
+ /*
+ * Configure as the rv_nop reactor.
+ */
+ mdef->rdef = get_reactor_rdef_by_name("nop");
+ mdef->rdef->counter++;
+ mdef->reacting = false;
+
+ return 0;
+}
+
+/**
+ * reactor_cleanup_monitor - cleanup a monitor reference
+ * @mdef: monitor's definition.
+ */
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+ lockdep_assert_held(&rv_interface_lock);
+ mdef->rdef->counter--;
+ WARN_ON_ONCE(mdef->rdef->counter < 0);
+}
+
+/*
+ * Nop reactor register
+ */
+static void rv_nop_reaction(char *msg)
+{
+}
+
+static struct rv_reactor rv_nop = {
+ .name = "nop",
+ .description = "no-operation reactor: do nothing.",
+ .react = rv_nop_reaction
+};
+
+int init_rv_reactors(struct dentry *root_dir)
+{
+ struct dentry *available, *reacting;
+ int retval;
+
+ available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
+ &available_reactors_ops);
+ if (!available)
+ goto out_err;
+
+ reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
+ if (!reacting)
+ goto rm_available;
+
+ retval = __rv_register_reactor(&rv_nop);
+ if (retval)
+ goto rm_reacting;
+
+ turn_reacting_on();
+
+ return 0;
+
+rm_reacting:
+ rv_remove(reacting);
+rm_available:
+ rv_remove(available);
+out_err:
+ return -ENOMEM;
+}
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 7d56d621ffea..354c2117be43 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Test module for in-kernel sythetic event creation and generation.
+ * Test module for in-kernel synthetic event creation and generation.
*
* Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
*/
@@ -22,7 +22,7 @@
* Then:
*
* # insmod kernel/trace/synth_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see several events in the trace buffer -
* "create_synth_test", "empty_synth_test", and several instances of
@@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void)
/* Now generate a gen_synth_test event */
ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("gen_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
/*
@@ -227,24 +225,24 @@ static int __init test_empty_synth_event(void)
/* Now trace an empty_synth_test event */
ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("empty_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
static struct synth_field_desc create_synth_test_fields[] = {
{ .type = "pid_t", .name = "next_pid_field" },
{ .type = "char[16]", .name = "next_comm_field" },
{ .type = "u64", .name = "ts_ns" },
+ { .type = "char[]", .name = "dynstring_field_1" },
{ .type = "u64", .name = "ts_ms" },
{ .type = "unsigned int", .name = "cpu" },
{ .type = "char[64]", .name = "my_string_field" },
+ { .type = "char[]", .name = "dynstring_field_2" },
{ .type = "int", .name = "my_int_field" },
};
@@ -254,7 +252,7 @@ static struct synth_field_desc create_synth_test_fields[] = {
*/
static int __init test_create_synth_event(void)
{
- u64 vals[7];
+ u64 vals[9];
int ret;
/* Create the create_synth_test event with the fields above */
@@ -292,10 +290,12 @@ static int __init test_create_synth_event(void)
vals[0] = 777; /* next_pid_field */
vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */
vals[2] = 1000000; /* ts_ns */
- vals[3] = 1000; /* ts_ms */
- vals[4] = raw_smp_processor_id(); /* cpu */
- vals[5] = (u64)(long)"thneed"; /* my_string_field */
- vals[6] = 398; /* my_int_field */
+ vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */
+ vals[4] = 1000; /* ts_ms */
+ vals[5] = raw_smp_processor_id(); /* cpu */
+ vals[6] = (u64)(long)"thneed"; /* my_string_field */
+ vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */
+ vals[8] = 398; /* my_int_field */
/* Now generate a create_synth_test event */
ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals));
@@ -303,7 +303,7 @@ static int __init test_create_synth_event(void)
return ret;
delete:
/* We got an error after creating the event, delete it */
- ret = synth_event_delete("create_synth_test");
+ synth_event_delete("create_synth_test");
goto out;
}
@@ -422,13 +422,15 @@ static int __init test_trace_synth_event(void)
int ret;
/* Trace some bogus values just for testing */
- ret = synth_event_trace(create_synth_test, 7, /* number of values */
+ ret = synth_event_trace(create_synth_test, 9, /* number of values */
(u64)444, /* next_pid_field */
(u64)(long)"clackers", /* next_comm_field */
(u64)1000000, /* ts_ns */
+ (u64)(long)"viewmaster",/* dynstring_field_1 */
(u64)1000, /* ts_ms */
(u64)raw_smp_processor_id(), /* cpu */
(u64)(long)"Thneed", /* my_string_field */
+ (u64)(long)"yoyos", /* dynstring_field_2 */
(u64)999); /* my_int_field */
return ret;
}
@@ -475,6 +477,17 @@ static int __init synth_event_gen_test_init(void)
ret = test_trace_synth_event();
WARN_ON(ret);
+
+ /* Disable when done */
+ trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false);
+ trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false);
+ trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false);
out:
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..c9c898307348 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -19,7 +19,6 @@
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
-#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
@@ -39,6 +38,8 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
+#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -49,15 +50,12 @@
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
#include "trace.h"
#include "trace_output.h"
-/*
- * On boot up, the ring buffer is set to the minimum size, so that
- * we do not waste memory on systems that are not using tracing.
- */
-bool ring_buffer_expanded;
-
+#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
* We need to change this state when a selftest is running.
* A selftest will lurk into the ring-buffer to count the
@@ -68,13 +66,27 @@ bool ring_buffer_expanded;
static bool __read_mostly tracing_selftest_running;
/*
- * If a tracer is running, we do not want to run SELFTEST.
+ * If boot-time tracing including tracers/events via kernel cmdline
+ * is running, we do not want to run SELFTEST.
*/
bool __read_mostly tracing_selftest_disabled;
+void __init disable_tracing_selftest(const char *reason)
+{
+ if (!tracing_selftest_disabled) {
+ tracing_selftest_disabled = true;
+ pr_info("Ftrace startup test is disabled due to %s\n", reason);
+ }
+}
+#else
+#define tracing_selftest_running 0
+#define tracing_selftest_disabled 0
+#endif
+
/* Pipe tracepoints to printk */
-struct trace_iterator *tracepoint_print_iter;
+static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
+static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
@@ -163,33 +175,41 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
int tracing_set_tracer(struct trace_array *tr, const char *buf);
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
- unsigned long flags, int pc);
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
+
+static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_instance_index;
+
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_snapshot_index;
static int __init set_cmdline_ftrace(char *str)
{
- strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+ strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
return 1;
}
__setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str) {
+ if (*str++ != '=' || !*str || !strcmp("1", str)) {
ftrace_dump_on_oops = DUMP_ALL;
return 1;
}
- if (!strcmp("orig_cpu", str)) {
+ if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
ftrace_dump_on_oops = DUMP_ORIG;
return 1;
}
@@ -208,20 +228,59 @@ __setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- ring_buffer_expanded = true;
+ char *slot = boot_snapshot_info + boot_snapshot_index;
+ int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+ int ret;
+
+ if (str[0] == '=') {
+ str++;
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_snapshot_index += ret;
+ } else {
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ trace_set_ring_buffer_expanded(NULL);
+ }
return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
+static int __init boot_instance(char *str)
+{
+ char *slot = boot_instance_info + boot_instance_index;
+ int left = sizeof(boot_instance_info) - boot_instance_index;
+ int ret;
+
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_instance_index += ret;
+
+ return 1;
+}
+__setup("trace_instance=", boot_instance);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
- strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- return 0;
+ strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -230,20 +289,31 @@ static char *trace_boot_clock __initdata;
static int __init set_trace_boot_clock(char *str)
{
- strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
+ strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
- return 0;
+ return 1;
}
__setup("trace_clock=", set_trace_boot_clock);
static int __init set_tracepoint_printk(char *str)
{
+ /* Ignore the "tp_printk_stop_on_boot" param */
+ if (*str == '_')
+ return 0;
+
if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
tracepoint_printk = 1;
return 1;
}
__setup("tp_printk", set_tracepoint_printk);
+static int __init set_tracepoint_printk_stop(char *str)
+{
+ tracepoint_printk_stop_on_boot = true;
+ return 1;
+}
+__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -251,13 +321,153 @@ unsigned long long ns2usecs(u64 nsec)
return nsec;
}
+static void
+trace_process_export(struct trace_export *export,
+ struct ring_buffer_event *event, int flag)
+{
+ struct trace_entry *entry;
+ unsigned int size = 0;
+
+ if (export->flags & flag) {
+ entry = ring_buffer_event_data(event);
+ size = ring_buffer_event_length(event);
+ export->write(export, entry, size);
+ }
+}
+
+static DEFINE_MUTEX(ftrace_export_lock);
+
+static struct trace_export __rcu *ftrace_exports_list __read_mostly;
+
+static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled);
+
+static inline void ftrace_exports_enable(struct trace_export *export)
+{
+ if (export->flags & TRACE_EXPORT_FUNCTION)
+ static_branch_inc(&trace_function_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_EVENT)
+ static_branch_inc(&trace_event_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_MARKER)
+ static_branch_inc(&trace_marker_exports_enabled);
+}
+
+static inline void ftrace_exports_disable(struct trace_export *export)
+{
+ if (export->flags & TRACE_EXPORT_FUNCTION)
+ static_branch_dec(&trace_function_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_EVENT)
+ static_branch_dec(&trace_event_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_MARKER)
+ static_branch_dec(&trace_marker_exports_enabled);
+}
+
+static void ftrace_exports(struct ring_buffer_event *event, int flag)
+{
+ struct trace_export *export;
+
+ preempt_disable_notrace();
+
+ export = rcu_dereference_raw_check(ftrace_exports_list);
+ while (export) {
+ trace_process_export(export, event, flag);
+ export = rcu_dereference_raw_check(export->next);
+ }
+
+ preempt_enable_notrace();
+}
+
+static inline void
+add_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ rcu_assign_pointer(export->next, *list);
+ /*
+ * We are entering export into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the export->next pointer is valid before another CPU sees
+ * the export pointer included into the list.
+ */
+ rcu_assign_pointer(*list, export);
+}
+
+static inline int
+rm_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ struct trace_export **p;
+
+ for (p = list; *p != NULL; p = &(*p)->next)
+ if (*p == export)
+ break;
+
+ if (*p != export)
+ return -1;
+
+ rcu_assign_pointer(*p, (*p)->next);
+
+ return 0;
+}
+
+static inline void
+add_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ ftrace_exports_enable(export);
+
+ add_trace_export(list, export);
+}
+
+static inline int
+rm_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ int ret;
+
+ ret = rm_trace_export(list, export);
+ ftrace_exports_disable(export);
+
+ return ret;
+}
+
+int register_ftrace_export(struct trace_export *export)
+{
+ if (WARN_ON_ONCE(!export->write))
+ return -1;
+
+ mutex_lock(&ftrace_export_lock);
+
+ add_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_export);
+
+int unregister_ftrace_export(struct trace_export *export)
+{
+ int ret;
+
+ mutex_lock(&ftrace_export_lock);
+
+ ret = rm_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_export);
+
/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS \
(FUNCTION_DEFAULT_FLAGS | \
TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
+ TRACE_ITER_HASH_PTR)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -275,6 +485,13 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+void trace_set_ring_buffer_expanded(struct trace_array *tr)
+{
+ if (!tr)
+ tr = &global_trace;
+ tr->ring_buffer_expanded = true;
+}
+
LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
@@ -303,6 +520,7 @@ static void __trace_array_put(struct trace_array *this_tr)
/**
* trace_array_put - Decrement the reference counter for this trace array.
+ * @this_tr : pointer to the trace array
*
* NOTE: Use this when we no longer need the trace array returned by
* trace_array_get_by_name(). This ensures the trace array can be later
@@ -350,35 +568,23 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
-void trace_free_pid_list(struct trace_pid_list *pid_list)
-{
- vfree(pid_list->pids);
- kfree(pid_list);
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
* @search_pid: The PID to find in @filtered_pids
*
- * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
*/
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
+ return trace_pid_list_is_set(filtered_pids, search_pid);
}
/**
* trace_ignore_this_task - should a task be ignored for tracing
* @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
* @task: The task that should be ignored if not filtered
*
* Checks if @task should be traced or not from @filtered_pids.
@@ -391,7 +597,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids,
struct task_struct *task)
{
/*
- * If filterd_no_pids is not empty, and the task's pid is listed
+ * If filtered_no_pids is not empty, and the task's pid is listed
* in filtered_no_pids, then return true.
* Otherwise, if filtered_pids is empty, that means we can
* trace all tasks. If it has content, then only trace pids
@@ -429,15 +635,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
return;
}
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
/* "self" is set for forks, and NULL for exits */
if (self)
- set_bit(task->pid, pid_list->pids);
+ trace_pid_list_set(pid_list, task->pid);
else
- clear_bit(task->pid, pid_list->pids);
+ trace_pid_list_clear(pid_list, task->pid);
}
/**
@@ -454,18 +656,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
*/
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
{
- unsigned long pid = (unsigned long)v;
+ long pid = (unsigned long)v;
+ unsigned int next;
(*pos)++;
- /* pid already is +1 of the actual prevous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+ /* pid already is +1 of the actual previous bit */
+ if (trace_pid_list_next(pid_list, pid, &next) < 0)
+ return NULL;
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
+ pid = next;
- return NULL;
+ /* Return pid + 1 to allow zero to be represented */
+ return (void *)(pid + 1);
}
/**
@@ -482,12 +685,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
{
unsigned long pid;
+ unsigned int first;
loff_t l = 0;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
+ if (trace_pid_list_first(pid_list, &first) < 0)
return NULL;
+ pid = first;
+
/* Return pid + 1 so that zero can be the exit value */
for (pid++; pid && l < *pos;
pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
@@ -523,7 +728,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
unsigned long val;
int nr_pids = 0;
ssize_t read = 0;
- ssize_t ret = 0;
+ ssize_t ret;
loff_t pos;
pid_t pid;
@@ -536,55 +741,48 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* the user. If the operation fails, then the current list is
* not modified.
*/
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ pid_list = trace_pid_list_alloc();
if (!pid_list) {
trace_parser_put(&parser);
return -ENOMEM;
}
- pid_list->pid_max = READ_ONCE(pid_max);
-
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
-
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- trace_parser_put(&parser);
- kfree(pid_list);
- return -ENOMEM;
- }
-
if (filtered_pids) {
/* copy the current bits to the new max */
- for_each_set_bit(pid, filtered_pids->pids,
- filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
+ ret = trace_pid_list_first(filtered_pids, &pid);
+ while (!ret) {
+ trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
}
+ ret = 0;
while (cnt > 0) {
pos = 0;
ret = trace_get_user(&parser, ubuf, cnt, &pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
+ if (ret < 0)
break;
read += ret;
ubuf += ret;
cnt -= ret;
+ if (!trace_parser_loaded(&parser))
+ break;
+
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
- if (val >= pid_list->pid_max)
- break;
pid = (pid_t)val;
- set_bit(pid, pid_list->pids);
+ if (trace_pid_list_set(pid_list, pid) < 0) {
+ ret = -1;
+ break;
+ }
nr_pids++;
trace_parser_clear(&parser);
@@ -593,14 +791,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_put(&parser);
if (ret < 0) {
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
return ret;
}
if (!nr_pids) {
/* Cleared the list of pids */
- trace_free_pid_list(pid_list);
- read = ret;
+ trace_pid_list_free(pid_list);
pid_list = NULL;
}
@@ -617,7 +814,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ts = ring_buffer_time_stamp(buf->buffer);
ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
@@ -629,7 +826,7 @@ u64 ftrace_now(int cpu)
}
/**
- * tracing_is_enabled - Show if global_trace has been disabled
+ * tracing_is_enabled - Show if global_trace has been enabled
*
* Shows if the global trace has been enabled or not. It uses the
* mirror flag "buffer_disabled" to be used in fast paths such as for
@@ -680,7 +877,7 @@ DEFINE_MUTEX(trace_types_lock);
* The content of events may become garbage if we allow other process consumes
* these events concurrently:
* A) the page of the consumed events may become a normal page
- * (not reader page) in ring buffer, and this page will be rewrited
+ * (not reader page) in ring buffer, and this page will be rewritten
* by events producer.
* B) The page of the consumed events may become a page for splice_read,
* and this page will be returned to system.
@@ -754,23 +951,23 @@ static inline void trace_access_lock_init(void)
#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
#else
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned long trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
@@ -778,24 +975,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
+ int type, unsigned int trace_ctx)
{
struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, type, flags, pc);
+ tracing_generic_entry_update(ent, type, trace_ctx);
}
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len);
if (event != NULL)
- trace_event_setup(event, type, flags, pc);
+ trace_event_setup(event, type, trace_ctx);
return event;
}
@@ -841,40 +1038,37 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev
ring_buffer_write(buffer, event->array[0], &event->array[1]);
/* Release the temp buffer */
this_cpu_dec(trace_buffered_event_cnt);
+ /* ring_buffer_unlock_commit() enables preemption */
+ preempt_enable_notrace();
} else
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
-/**
- * __trace_puts - write a constant string into the trace buffer.
- * @ip: The address of the caller
- * @str: The constant string to write
- * @size: The size of the string.
- */
-int __trace_puts(unsigned long ip, const char *str, int size)
+int __trace_array_puts(struct trace_array *tr, unsigned long ip,
+ const char *str, int size)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int alloc;
- int pc;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
+ if (unlikely(tracing_selftest_running && tr == &global_trace))
+ return 0;
- if (unlikely(tracing_selftest_running || tracing_disabled))
+ if (unlikely(tracing_disabled))
return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
- local_save_flags(irq_flags);
- buffer = global_trace.array_buffer.buffer;
+ trace_ctx = tracing_gen_ctx();
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
if (!event) {
size = 0;
goto out;
@@ -893,11 +1087,23 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
}
+EXPORT_SYMBOL_GPL(__trace_array_puts);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ return __trace_array_puts(&global_trace, ip, str, size);
+}
EXPORT_SYMBOL_GPL(__trace_puts);
/**
@@ -910,25 +1116,22 @@ int __trace_bputs(unsigned long ip, const char *str)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int size = sizeof(struct bputs_entry);
int ret = 0;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ trace_ctx);
if (!event)
goto out;
@@ -937,7 +1140,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -954,22 +1157,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
unsigned long flags;
if (in_nmi()) {
- internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- internal_trace_puts("*** snapshot is being ignored ***\n");
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
return;
}
if (!tr->allocated_snapshot) {
- internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- internal_trace_puts("*** stopping trace here! ***\n");
- tracing_off();
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
return;
}
/* Note, snapshot can not be used when the tracer uses it */
if (tracer->use_max_tr) {
- internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
@@ -992,7 +1195,7 @@ void tracing_snapshot_instance(struct trace_array *tr)
*
* Note, make sure to allocate the snapshot with either
* a tracing_snapshot_alloc(), or by doing it manually
- * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ * with: echo 1 > /sys/kernel/tracing/snapshot
*
* If the snapshot buffer is not allocated, it will stop tracing.
* Basically making a permanent snapshot.
@@ -1025,7 +1228,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
/**
- * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
* @tr: The tracing instance
*
* When the user enables a conditional snapshot using
@@ -1042,12 +1245,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
{
void *cond_data = NULL;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
cond_data = tr->cond_snapshot->cond_data;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return cond_data;
}
@@ -1059,10 +1264,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1082,6 +1294,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -1183,9 +1396,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
mutex_unlock(&trace_types_lock);
@@ -1212,6 +1427,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
{
int ret = 0;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (!tr->cond_snapshot)
@@ -1222,6 +1438,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return ret;
}
@@ -1264,6 +1481,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
return false;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+#define free_snapshot(tr) do { } while (0)
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1315,7 +1533,7 @@ void disable_trace_on_warning(void)
bool tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
- return ring_buffer_record_is_on(tr->array_buffer.buffer);
+ return ring_buffer_record_is_set_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -1335,10 +1553,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+ * nr_entries can not be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
@@ -1372,7 +1592,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
#undef C
#define C(a, b) b
-/* These must match the bit postions in trace_iterator_flags */
+/* These must match the bit positions in trace_iterator_flags */
static const char *trace_options[] = {
TRACE_FLAGS
NULL
@@ -1391,6 +1611,7 @@ static struct {
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
{ ktime_get_boot_fast_ns, "boot", 1 },
+ { ktime_get_tai_fast_ns, "tai", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1519,23 +1740,24 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (trace_seq_used(s) <= s->seq.readpos)
+ if (trace_seq_used(s) <= s->readpos)
return -EBUSY;
- len = trace_seq_used(s) - s->seq.readpos;
+ len = trace_seq_used(s) - s->readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->seq.readpos, cnt);
+ memcpy(buf, s->buffer + s->readpos, cnt);
- s->seq.readpos += cnt;
+ s->readpos += cnt;
return cnt;
}
unsigned long __read_mostly tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops;
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#ifdef LATENCY_FS_NOTIFY
static struct workqueue_struct *fsnotify_wq;
@@ -1543,8 +1765,7 @@ static void latency_fsnotify_workfn(struct work_struct *work)
{
struct trace_array *tr = container_of(work, struct trace_array,
fsnotify_work);
- fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
- tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@ -1559,8 +1780,9 @@ static void trace_create_maxlat_file(struct trace_array *tr,
{
INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
- tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
- d_tracer, &tr->max_latency,
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
+ d_tracer, tr,
&tracing_max_lat_fops);
}
@@ -1589,19 +1811,14 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- * defined(CONFIG_FSNOTIFY)
- */
-#else
+#else /* !LATENCY_FS_NOTIFY */
#define trace_create_maxlat_file(tr, d_tracer) \
- trace_create_file("tracing_max_latency", 0644, d_tracer, \
- &tr->max_latency, &tracing_max_lat_fops)
+ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
+ d_tracer, tr, &tracing_max_lat_fops)
#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -1676,15 +1893,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
ring_buffer_record_off(tr->max_buffer.buffer);
#ifdef CONFIG_TRACER_SNAPSHOT
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
- goto out_unlock;
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
#endif
swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
- out_unlock:
arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
/**
@@ -1720,9 +1941,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* place on this CPU. We fail to record, but we reset
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
+ * Another reason is resize is in progress.
*/
trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit in progress\n");
+ "Failed to swap buffers due to commit or resize in progress\n");
}
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
@@ -1730,16 +1952,28 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
}
+
#endif /* CONFIG_TRACER_MAX_TRACE */
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ int ret;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
- full);
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * Make sure this is still the snapshot buffer, as if a snapshot were
+ * to happen, this would now be the main buffer.
+ */
+ if (iter->snapshot)
+ iter->array_buffer = &iter->tr->max_buffer;
+#endif
+ return ret;
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1782,6 +2016,12 @@ static int run_tracer_selftest(struct tracer *type)
if (!selftests_can_run)
return save_selftest(type);
+ if (!tracing_is_on()) {
+ pr_warn("Selftest for tracer %s skipped due to tracing disabled\n",
+ type->name);
+ return 0;
+ }
+
/*
* Run a selftest on this tracer.
* Here we reset the trace buffer, and set the current
@@ -1796,7 +2036,7 @@ static int run_tracer_selftest(struct tracer *type)
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
/* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
RING_BUFFER_ALL_CPUS);
tr->allocated_snapshot = true;
@@ -1822,7 +2062,7 @@ static int run_tracer_selftest(struct tracer *type)
tr->allocated_snapshot = false;
/* Shrink the max buffer again */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, 1,
RING_BUFFER_ALL_CPUS);
}
@@ -1832,6 +2072,24 @@ static int run_tracer_selftest(struct tracer *type)
return 0;
}
+static int do_run_tracer_selftest(struct tracer *type)
+{
+ int ret;
+
+ /*
+ * Tests can take a long time, especially if they are run one after the
+ * other, as does happen during bootup when all the tracers are
+ * registered. This could cause the soft lockup watchdog to trigger.
+ */
+ cond_resched();
+
+ tracing_selftest_running = true;
+ ret = run_tracer_selftest(type);
+ tracing_selftest_running = false;
+
+ return ret;
+}
+
static __init int init_trace_selftests(void)
{
struct trace_selftests *p, *n;
@@ -1883,6 +2141,10 @@ static inline int run_tracer_selftest(struct tracer *type)
{
return 0;
}
+static inline int do_run_tracer_selftest(struct tracer *type)
+{
+ return 0;
+}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
static void add_tracer_options(struct trace_array *tr, struct tracer *t);
@@ -1918,8 +2180,6 @@ int __init register_tracer(struct tracer *type)
mutex_lock(&trace_types_lock);
- tracing_selftest_running = true;
-
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
@@ -1948,7 +2208,7 @@ int __init register_tracer(struct tracer *type)
/* store the tracer for __set_tracer_option */
type->flags->trace = type;
- ret = run_tracer_selftest(type);
+ ret = do_run_tracer_selftest(type);
if (ret < 0)
goto out;
@@ -1957,7 +2217,6 @@ int __init register_tracer(struct tracer *type)
add_tracer_options(&global_trace, type);
out:
- tracing_selftest_running = false;
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
@@ -1974,11 +2233,7 @@ int __init register_tracer(struct tracer *type)
apply_trace_boot_options();
/* disable other selftests, since this will break it. */
- tracing_selftest_disabled = true;
-#ifdef CONFIG_FTRACE_STARTUP_TEST
- printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
- type->name);
-#endif
+ disable_tracing_selftest("running a tracer");
out_unlock:
return ret;
@@ -2003,7 +2258,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
void tracing_reset_online_cpus(struct array_buffer *buf)
{
struct trace_buffer *buffer = buf->buffer;
- int cpu;
if (!buffer)
return;
@@ -2015,17 +2269,18 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
buf->time_start = buffer_ftrace_now(buf, buf->cpu);
- for_each_online_cpu(cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ ring_buffer_reset_online_cpus(buffer);
ring_buffer_record_enable(buffer);
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2037,23 +2292,39 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
static int *tgid_map;
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
unsigned cmdline_num;
int cmdline_idx;
- char *saved_cmdlines;
+ char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;
-/* temporary disable recording */
-static atomic_t trace_record_taskinfo_disabled __read_mostly;
-
static inline char *get_saved_cmdlines(int idx)
{
return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
@@ -2064,47 +2335,60 @@ static inline void set_cmdline(int idx, const char *cmdline)
strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-static int allocate_cmdlines_buffer(unsigned int val,
- struct saved_cmdlines_buffer *s)
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kfree(s->map_cmdline_to_pid);
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / TASK_COMM_LEN;
+ s->cmdline_num = val;
+
s->map_cmdline_to_pid = kmalloc_array(val,
sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
- if (!s->map_cmdline_to_pid)
- return -ENOMEM;
-
- s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
- if (!s->saved_cmdlines) {
- kfree(s->map_cmdline_to_pid);
- return -ENOMEM;
+ if (!s->map_cmdline_to_pid) {
+ free_saved_cmdlines_buffer(s);
+ return NULL;
}
s->cmdline_idx = 0;
- s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- return 0;
+ return s;
}
static int trace_create_savedcmd(void)
{
- int ret;
-
- savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
- if (!savedcmd)
- return -ENOMEM;
-
- ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
- if (ret < 0) {
- kfree(savedcmd);
- savedcmd = NULL;
- return -ENOMEM;
- }
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
- return 0;
+ return savedcmd ? 0 : -ENOMEM;
}
int is_tracing_stopped(void)
@@ -2112,13 +2396,7 @@ int is_tracing_stopped(void)
return global_trace.stop_count;
}
-/**
- * tracing_start - quick start of the tracer
- *
- * If tracing is enabled but was stopped by tracing_stop,
- * this will start the tracer back up.
- */
-void tracing_start(void)
+static void tracing_start_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
@@ -2126,161 +2404,117 @@ void tracing_start(void)
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (--global_trace.stop_count) {
- if (global_trace.stop_count < 0) {
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (--tr->stop_count) {
+ if (WARN_ON_ONCE(tr->stop_count < 0)) {
/* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- global_trace.stop_count = 0;
+ tr->stop_count = 0;
}
goto out;
}
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
-}
-
-static void tracing_start_tr(struct trace_array *tr)
-{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- if (tracing_disabled)
- return;
-
- /* If global, we need to also start the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_start();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
-
- if (--tr->stop_count) {
- if (tr->stop_count < 0) {
- /* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- tr->stop_count = 0;
- }
- goto out;
- }
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_enable(buffer);
+ arch_spin_unlock(&tr->max_lock);
out:
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
- * tracing_stop - quick stop of the tracer
+ * tracing_start - quick start of the tracer
*
- * Light weight way to stop tracing. Use in conjunction with
- * tracing_start.
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
*/
-void tracing_stop(void)
+void tracing_start(void)
+
+{
+ return tracing_start_tr(&global_trace);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (global_trace.stop_count++)
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
goto out;
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
+ arch_spin_unlock(&tr->max_lock);
out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-static void tracing_stop_tr(struct trace_array *tr)
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- /* If global, we need to also stop the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_stop();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
- if (tr->stop_count++)
- goto out;
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_disable(buffer);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+ return tracing_stop_tr(&global_trace);
}
static int trace_save_cmdline(struct task_struct *tsk)
{
- unsigned pid, idx;
+ unsigned tpid, idx;
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(tsk->pid > PID_MAX_DEFAULT))
- return 0;
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better been disabled and run queue lock been held.
*/
+ lockdep_assert_preemption_disabled();
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
- idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tpid];
if (idx == NO_CMDLINE_MAP) {
idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
- /*
- * Check whether the cmdline buffer at idx has a pid
- * mapped. We are going to overwrite that entry so we
- * need to clear the map_pid_to_cmdline. Otherwise we
- * would read the new comm for the old pid.
- */
- pid = savedcmd->map_cmdline_to_pid[idx];
- if (pid != NO_CMDLINE_MAP)
- savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
-
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
savedcmd->cmdline_idx = idx;
}
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
@@ -2291,6 +2525,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
+ int tpid;
if (!pid) {
strcpy(comm, "<idle>");
@@ -2302,16 +2537,16 @@ static void __trace_find_cmdline(int pid, char comm[])
return;
}
- if (pid > PID_MAX_DEFAULT) {
- strcpy(comm, "<...>");
- return;
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
}
-
- map = savedcmd->map_pid_to_cmdline[pid];
- if (map != NO_CMDLINE_MAP)
- strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- else
- strcpy(comm, "<...>");
+ strcpy(comm, "<...>");
}
void trace_find_cmdline(int pid, char comm[])
@@ -2325,24 +2560,41 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
int trace_find_tgid(int pid)
{
- if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
- return 0;
+ int *ptr = trace_find_tgid_ptr(pid);
- return tgid_map[pid];
+ return ptr ? *ptr : 0;
}
static int trace_save_tgid(struct task_struct *tsk)
{
+ int *ptr;
+
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
return 0;
- tgid_map[tsk->pid] = tsk->tgid;
+ *ptr = tsk->tgid;
return 1;
}
@@ -2350,8 +2602,6 @@ static bool tracing_record_taskinfo_skip(int flags)
{
if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
return true;
- if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
- return true;
if (!__this_cpu_read(trace_taskinfo_save))
return true;
return false;
@@ -2440,36 +2690,46 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
- unsigned long flags, int pc)
+static unsigned short migration_disable_value(void)
{
- struct task_struct *tsk = current;
-
- entry->preempt_count = pc & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
- entry->type = type;
- entry->flags =
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#if defined(CONFIG_SMP)
+ return current->migration_disabled;
#else
- TRACE_FLAG_IRQS_NOSUPPORT |
+ return 0;
#endif
- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
+
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
+{
+ unsigned int trace_flags = irqs_status;
+ unsigned int pc;
+
+ pc = preempt_count();
+
+ if (pc & NMI_MASK)
+ trace_flags |= TRACE_FLAG_NMI;
+ if (pc & HARDIRQ_MASK)
+ trace_flags |= TRACE_FLAG_HARDIRQ;
+ if (in_serving_softirq())
+ trace_flags |= TRACE_FLAG_SOFTIRQ;
+ if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
+ trace_flags |= TRACE_FLAG_BH_OFF;
+
+ if (tif_need_resched())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED;
+ if (test_preempt_need_resched())
+ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
+}
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2504,8 +2764,11 @@ void trace_buffered_event_enable(void)
for_each_tracing_cpu(cpu) {
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto failed;
+ /* This is just an optimization and can handle failures */
+ if (!page) {
+ pr_err("Failed to allocate event buffer\n");
+ break;
+ }
event = page_address(page);
memset(event, 0, sizeof(*event));
@@ -2514,15 +2777,11 @@ void trace_buffered_event_enable(void)
preempt_disable();
if (cpu == smp_processor_id() &&
- this_cpu_read(trace_buffered_event) !=
+ __this_cpu_read(trace_buffered_event) !=
per_cpu(trace_buffered_event, cpu))
WARN_ON_ONCE(1);
preempt_enable();
}
-
- return;
- failed:
- trace_buffered_event_disable();
}
static void enable_trace_buffered_event(void *data)
@@ -2557,11 +2816,9 @@ void trace_buffered_event_disable(void)
if (--trace_buffered_event_ref)
return;
- preempt_disable();
/* For each CPU, set the buffer as used. */
- smp_call_function_many(tracing_buffer_mask,
- disable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
+ NULL, true);
/* Wait for all current users to finish */
synchronize_rcu();
@@ -2570,17 +2827,19 @@ void trace_buffered_event_disable(void)
free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
per_cpu(trace_buffered_event, cpu) = NULL;
}
+
/*
- * Make sure trace_buffered_event is NULL before clearing
- * trace_buffered_event_cnt.
+ * Wait for all CPUs that potentially started checking if they can use
+ * their event buffer only after the previous synchronize_rcu() call and
+ * they still read a valid pointer from trace_buffered_event. It must be
+ * ensured they don't see cleared trace_buffered_event_cnt else they
+ * could wrongly decide to use the pointed-to buffer which is now freed.
*/
- smp_wmb();
+ synchronize_rcu();
- preempt_disable();
- /* Do the work on each cpu */
- smp_call_function_many(tracing_buffer_mask,
- enable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ /* For each CPU, relinquish the buffer */
+ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
+ true);
}
static struct trace_buffer *temp_buffer;
@@ -2589,44 +2848,86 @@ struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *entry;
+ struct trace_array *tr = trace_file->tr;
int val;
- *current_rb = trace_file->tr->array_buffer.buffer;
-
- if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
- (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
- (entry = this_cpu_read(trace_buffered_event))) {
- /* Try to use the per cpu buffer first */
- val = this_cpu_inc_return(trace_buffered_event_cnt);
- if (val == 1) {
- trace_event_setup(entry, type, flags, pc);
- entry->array[0] = len;
- return entry;
+ *current_rb = tr->array_buffer.buffer;
+
+ if (!tr->no_filter_buffering_ref &&
+ (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) {
+ preempt_disable_notrace();
+ /*
+ * Filtering is on, so try to use the per cpu buffer first.
+ * This buffer will simulate a ring_buffer_event,
+ * where the type_len is zero and the array[0] will
+ * hold the full length.
+ * (see include/linux/ring-buffer.h for details on
+ * how the ring_buffer_event is structured).
+ *
+ * Using a temp buffer during filtering and copying it
+ * on a matched filter is quicker than writing directly
+ * into the ring buffer and then discarding it when
+ * it doesn't match. That is because the discard
+ * requires several atomic operations to get right.
+ * Copying on match and doing nothing on a failed match
+ * is still quicker than no copy on match, but having
+ * to discard out of the ring buffer on a failed match.
+ */
+ if ((entry = __this_cpu_read(trace_buffered_event))) {
+ int max_len = PAGE_SIZE - struct_size(entry, array, 1);
+
+ val = this_cpu_inc_return(trace_buffered_event_cnt);
+
+ /*
+ * Preemption is disabled, but interrupts and NMIs
+ * can still come in now. If that happens after
+ * the above increment, then it will have to go
+ * back to the old method of allocating the event
+ * on the ring buffer, and if the filter fails, it
+ * will have to call ring_buffer_discard_commit()
+ * to remove it.
+ *
+ * Need to also check the unlikely case that the
+ * length is bigger than the temp buffer size.
+ * If that happens, then the reserve is pretty much
+ * guaranteed to fail, as the ring buffer currently
+ * only allows events less than a page. But that may
+ * change in the future, so let the ring buffer reserve
+ * handle the failure in that case.
+ */
+ if (val == 1 && likely(len <= max_len)) {
+ trace_event_setup(entry, type, trace_ctx);
+ entry->array[0] = len;
+ /* Return with preemption disabled */
+ return entry;
+ }
+ this_cpu_dec(trace_buffered_event_cnt);
}
- this_cpu_dec(trace_buffered_event_cnt);
+ /* __trace_buffer_lock_reserve() disables preemption */
+ preempt_enable_notrace();
}
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
- * to store the trace event for the tigger to use. It's recusive
+ * to store the trace event for the trigger to use. It's recursive
* safe and will not be recorded anywhere.
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
}
return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
static DEFINE_MUTEX(tracepoint_printk_mutex);
static void output_printk(struct trace_event_buffer *fbuffer)
@@ -2654,14 +2955,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
event = &fbuffer->trace_file->event_call->event;
- spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ raw_spin_lock_irqsave(&tracepoint_iter_lock, flags);
trace_seq_init(&iter->seq);
iter->ent = fbuffer->entry;
event_call->event.funcs->trace(iter, 0, event);
trace_seq_putc(&iter->seq, 0);
printk("%s", iter->seq.buffer);
- spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+ raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
@@ -2699,12 +3000,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
+ enum event_trigger_type tt = ETT_NONE;
+ struct trace_event_file *file = fbuffer->trace_file;
+
+ if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
+ fbuffer->entry, &tt))
+ goto discard;
+
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
- event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc, fbuffer->regs);
+ if (static_branch_unlikely(&trace_event_exports_enabled))
+ ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
+
+ trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
+ fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);
+
+discard:
+ if (tt)
+ event_triggers_post_call(file, tt);
+
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2720,7 +3035,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trace_ctx,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
@@ -2731,8 +3046,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* and mmiotrace, but that's ok if they lose a function or
* two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
- ftrace_trace_userstack(buffer, flags, pc);
+ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+ ftrace_trace_userstack(tr, buffer, trace_ctx);
}
/*
@@ -2745,133 +3060,9 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
__buffer_unlock_commit(buffer, event);
}
-static void
-trace_process_export(struct trace_export *export,
- struct ring_buffer_event *event)
-{
- struct trace_entry *entry;
- unsigned int size = 0;
-
- entry = ring_buffer_event_data(event);
- size = ring_buffer_event_length(event);
- export->write(export, entry, size);
-}
-
-static DEFINE_MUTEX(ftrace_export_lock);
-
-static struct trace_export __rcu *ftrace_exports_list __read_mostly;
-
-static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled);
-
-static inline void ftrace_exports_enable(void)
-{
- static_branch_enable(&ftrace_exports_enabled);
-}
-
-static inline void ftrace_exports_disable(void)
-{
- static_branch_disable(&ftrace_exports_enabled);
-}
-
-static void ftrace_exports(struct ring_buffer_event *event)
-{
- struct trace_export *export;
-
- preempt_disable_notrace();
-
- export = rcu_dereference_raw_check(ftrace_exports_list);
- while (export) {
- trace_process_export(export, event);
- export = rcu_dereference_raw_check(export->next);
- }
-
- preempt_enable_notrace();
-}
-
-static inline void
-add_trace_export(struct trace_export **list, struct trace_export *export)
-{
- rcu_assign_pointer(export->next, *list);
- /*
- * We are entering export into the list but another
- * CPU might be walking that list. We need to make sure
- * the export->next pointer is valid before another CPU sees
- * the export pointer included into the list.
- */
- rcu_assign_pointer(*list, export);
-}
-
-static inline int
-rm_trace_export(struct trace_export **list, struct trace_export *export)
-{
- struct trace_export **p;
-
- for (p = list; *p != NULL; p = &(*p)->next)
- if (*p == export)
- break;
-
- if (*p != export)
- return -1;
-
- rcu_assign_pointer(*p, (*p)->next);
-
- return 0;
-}
-
-static inline void
-add_ftrace_export(struct trace_export **list, struct trace_export *export)
-{
- if (*list == NULL)
- ftrace_exports_enable();
-
- add_trace_export(list, export);
-}
-
-static inline int
-rm_ftrace_export(struct trace_export **list, struct trace_export *export)
-{
- int ret;
-
- ret = rm_trace_export(list, export);
- if (*list == NULL)
- ftrace_exports_disable();
-
- return ret;
-}
-
-int register_ftrace_export(struct trace_export *export)
-{
- if (WARN_ON_ONCE(!export->write))
- return -1;
-
- mutex_lock(&ftrace_export_lock);
-
- add_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_export);
-
-int unregister_ftrace_export(struct trace_export *export)
-{
- int ret;
-
- mutex_lock(&ftrace_export_lock);
-
- ret = rm_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_export);
-
void
-trace_function(struct trace_array *tr,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
+trace_function(struct trace_array *tr, unsigned long ip, unsigned long
+ parent_ip, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2879,7 +3070,7 @@ trace_function(struct trace_array *tr,
struct ftrace_entry *entry;
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -2887,8 +3078,8 @@ trace_function(struct trace_array *tr,
entry->parent_ip = parent_ip;
if (!call_filter_check_discard(call, entry, buffer, event)) {
- if (static_branch_unlikely(&ftrace_exports_enabled))
- ftrace_exports(event);
+ if (static_branch_unlikely(&trace_function_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_FUNCTION);
__buffer_unlock_commit(buffer, event);
}
}
@@ -2913,8 +3104,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
@@ -2932,18 +3123,12 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
skip++;
#endif
- /*
- * Since events can happen in NMIs there's no safe way to
- * use the per cpu ftrace_stacks. We reserve it and if an interrupt
- * or NMI comes in, it will just have to use the default
- * FTRACE_STACK_SIZE.
- */
preempt_disable_notrace();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
/* This should never happen. If it does, yell once and skip */
- if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
goto out;
/*
@@ -2965,15 +3150,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
- size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ struct_size(entry, caller, nr_entries),
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = nr_entries;
+ memcpy(&entry->caller, fstack->calls,
+ flex_array_size(entry, caller, nr_entries));
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2988,37 +3174,40 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
}
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
return;
}
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
+ return;
+
/*
- * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+ * When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
- * triggered someplace critical, and rcu_irq_enter() should
+ * triggered someplace critical, and ct_irq_enter() should
* not be called from NMI.
*/
if (unlikely(in_nmi()))
return;
- rcu_irq_enter_irqson();
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
- rcu_irq_exit_irqson();
+ ct_irq_enter_irqson();
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ ct_irq_exit_irqson();
}
/**
@@ -3027,19 +3216,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
*/
void trace_dump_stack(int skip)
{
- unsigned long flags;
-
if (tracing_disabled || tracing_selftest_running)
return;
- local_save_flags(flags);
-
#ifndef CONFIG_UNWINDER_ORC
/* Skip 1 to skip this function. */
skip++;
#endif
__ftrace_trace_stack(global_trace.array_buffer.buffer,
- flags, skip, preempt_count(), NULL);
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3047,13 +3232,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
static void
-ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -3074,7 +3260,7 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -3092,38 +3278,73 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
- unsigned long flags, int pc)
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* CONFIG_STACKTRACE */
+static inline void
+func_repeats_set_delta_ts(struct func_repeats_entry *entry,
+ unsigned long long delta)
+{
+ entry->bottom_delta_ts = delta & U32_MAX;
+ entry->top_delta_ts = (delta >> 32);
+}
+
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx)
+{
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct func_repeats_entry *entry;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS,
+ sizeof(*entry), trace_ctx);
+ if (!event)
+ return;
+
+ delta = ring_buffer_event_time_stamp(buffer, event) -
+ last_info->ts_last_call;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = last_info->ip;
+ entry->parent_ip = last_info->parent_ip;
+ entry->count = last_info->count;
+ func_repeats_set_delta_ts(entry, delta);
+
+ __buffer_unlock_commit(buffer, event);
+}
+
/* created for use with alloc_percpu */
struct trace_buffer_struct {
int nesting;
char buffer[4][TRACE_BUF_SIZE];
};
-static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
/*
- * Thise allows for lockless recording. If we're nested too deeply, then
+ * This allows for lockless recording. If we're nested too deeply, then
* this returns NULL.
*/
static char *get_trace_buf(void)
{
struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!buffer || buffer->nesting >= 4)
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
return NULL;
buffer->nesting++;
/* Interrupts must see nesting incremented before we use the buffer */
barrier();
- return &buffer->buffer[buffer->nesting][0];
+ return &buffer->buffer[buffer->nesting - 1][0];
}
static void put_trace_buf(void)
@@ -3135,7 +3356,10 @@ static void put_trace_buf(void)
static int alloc_percpu_trace_buffer(void)
{
- struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct __percpu *buffers;
+
+ if (trace_percpu_buffer)
+ return 0;
buffers = alloc_percpu(struct trace_buffer_struct);
if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
@@ -3173,7 +3397,7 @@ void trace_printk_init_buffers(void)
pr_warn("**********************************************************\n");
/* Expand the buffers to set size */
- tracing_update_buffers();
+ tracing_update_buffers(&global_trace);
buffers_allocated = 1;
@@ -3220,9 +3444,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
- int len = 0, size, pc;
+ int len = 0, size;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3230,7 +3454,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
tbuffer = get_trace_buf();
@@ -3244,12 +3468,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
goto out_put;
- local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3259,7 +3482,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3282,18 +3505,18 @@ __trace_array_vprintk(struct trace_buffer *buffer,
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
- int len = 0, size, pc;
+ int len = 0, size;
struct print_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
- if (tracing_disabled || tracing_selftest_running)
+ if (tracing_disabled)
return 0;
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
@@ -3305,11 +3528,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- local_save_flags(flags);
size = sizeof(*entry) + len + 1;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3318,7 +3540,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3336,9 +3558,32 @@ __printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
+ if (tracing_selftest_running && tr == &global_trace)
+ return 0;
+
return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, they have the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should not
+ * be ever incorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
@@ -3346,12 +3591,16 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
if (!tr)
return -ENOENT;
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3359,6 +3608,27 @@ int trace_array_printk(struct trace_array *tr,
}
EXPORT_SYMBOL_GPL(trace_array_printk);
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
@@ -3476,8 +3746,300 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
return next;
}
+#define STATIC_FMT_BUF_SIZE 128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+ char *tmp;
+
+ /*
+ * iter->tr is NULL when used with tp_printk, which makes
+ * this get called where it is not safe to call krealloc().
+ */
+ if (!iter->tr || iter->fmt == static_fmt_buf)
+ return NULL;
+
+ tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+ GFP_KERNEL);
+ if (tmp) {
+ iter->fmt_size += STATIC_FMT_BUF_SIZE;
+ iter->fmt = tmp;
+ }
+
+ return tmp;
+}
+
+/* Returns true if the string is safe to dereference from an event */
+static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+ bool star, int len)
+{
+ unsigned long addr = (unsigned long)str;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+
+ /* Ignore strings with no length */
+ if (star && !len)
+ return true;
+
+ /* OK if part of the event data */
+ if ((addr >= (unsigned long)iter->ent) &&
+ (addr < (unsigned long)iter->ent + iter->ent_size))
+ return true;
+
+ /* OK if part of the temp seq buffer */
+ if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
+ (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
+ return true;
+
+ /* Core rodata can not be freed */
+ if (is_kernel_rodata(addr))
+ return true;
+
+ if (trace_is_tracepoint_string(str))
+ return true;
+
+ /*
+ * Now this could be a module event, referencing core module
+ * data, which is OK.
+ */
+ if (!iter->ent)
+ return false;
+
+ trace_event = ftrace_find_event(iter->ent->type);
+ if (!trace_event)
+ return false;
+
+ event = container_of(trace_event, struct trace_event_call, event);
+ if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module)
+ return false;
+
+ /* Would rather have rodata, but this will suffice */
+ if (within_module_core(addr, event->module))
+ return true;
+
+ return false;
+}
+
+static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+
+static int test_can_verify_check(const char *fmt, ...)
+{
+ char buf[16];
+ va_list ap;
+ int ret;
+
+ /*
+ * The verifier is dependent on vsnprintf() modifies the va_list
+ * passed to it, where it is sent as a reference. Some architectures
+ * (like x86_32) passes it by value, which means that vsnprintf()
+ * does not modify the va_list passed to it, and the verifier
+ * would then need to be able to understand all the values that
+ * vsnprintf can use. If it is passed by value, then the verifier
+ * is disabled.
+ */
+ va_start(ap, fmt);
+ vsnprintf(buf, 16, "%d", ap);
+ ret = va_arg(ap, int);
+ va_end(ap);
+
+ return ret;
+}
+
+static void test_can_verify(void)
+{
+ if (!test_can_verify_check("%d %d", 0, 1)) {
+ pr_info("trace event string verifier disabled\n");
+ static_branch_inc(&trace_no_verify);
+ }
+}
+
+/**
+ * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * @iter: The iterator that holds the seq buffer and the event being printed
+ * @fmt: The format used to print the event
+ * @ap: The va_list holding the data to print from @fmt.
+ *
+ * This writes the data into the @iter->seq buffer using the data from
+ * @fmt and @ap. If the format has a %s, then the source of the string
+ * is examined to make sure it is safe to print, otherwise it will
+ * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
+ * pointer.
+ */
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap)
+{
+ const char *p = fmt;
+ const char *str;
+ int i, j;
+
+ if (WARN_ON_ONCE(!fmt))
+ return;
+
+ if (static_branch_unlikely(&trace_no_verify))
+ goto print;
+
+ /* Don't bother checking when doing a ftrace_dump() */
+ if (iter->fmt == static_fmt_buf)
+ goto print;
+
+ while (*p) {
+ bool star = false;
+ int len = 0;
+
+ j = 0;
+
+ /* We only care about %s and variants */
+ for (i = 0; p[i]; i++) {
+ if (i + 1 >= iter->fmt_size) {
+ /*
+ * If we can't expand the copy buffer,
+ * just print it.
+ */
+ if (!trace_iter_expand_format(iter))
+ goto print;
+ }
+
+ if (p[i] == '\\' && p[i+1]) {
+ i++;
+ continue;
+ }
+ if (p[i] == '%') {
+ /* Need to test cases like %08.*s */
+ for (j = 1; p[i+j]; j++) {
+ if (isdigit(p[i+j]) ||
+ p[i+j] == '.')
+ continue;
+ if (p[i+j] == '*') {
+ star = true;
+ continue;
+ }
+ break;
+ }
+ if (p[i+j] == 's')
+ break;
+ star = false;
+ }
+ j = 0;
+ }
+ /* If no %s found then just print normally */
+ if (!p[i])
+ break;
+
+ /* Copy up to the %s, and print that */
+ strncpy(iter->fmt, p, i);
+ iter->fmt[i] = '\0';
+ trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+
+ /*
+ * If iter->seq is full, the above call no longer guarantees
+ * that ap is in sync with fmt processing, and further calls
+ * to va_arg() can return wrong positional arguments.
+ *
+ * Ensure that ap is no longer used in this case.
+ */
+ if (iter->seq.full) {
+ p = "";
+ break;
+ }
+
+ if (star)
+ len = va_arg(ap, int);
+
+ /* The ap now points to the string data of the %s */
+ str = va_arg(ap, const char *);
+
+ /*
+ * If you hit this warning, it is likely that the
+ * trace event in question used %s on a string that
+ * was saved at the time of the event, but may not be
+ * around when the trace is read. Use __string(),
+ * __assign_str() and __get_str() helpers in the TRACE_EVENT()
+ * instead. See samples/trace_events/trace-events-sample.h
+ * for reference.
+ */
+ if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
+ "fmt: '%s' current_buffer: '%s'",
+ fmt, seq_buf_str(&iter->seq.seq))) {
+ int ret;
+
+ /* Try to safely read the string */
+ if (star) {
+ if (len + 1 > iter->fmt_size)
+ len = iter->fmt_size - 1;
+ if (len < 0)
+ len = 0;
+ ret = copy_from_kernel_nofault(iter->fmt, str, len);
+ iter->fmt[len] = 0;
+ star = false;
+ } else {
+ ret = strncpy_from_kernel_nofault(iter->fmt, str,
+ iter->fmt_size);
+ }
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", str);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ str, iter->fmt);
+ str = "[UNSAFE-MEMORY]";
+ strcpy(iter->fmt, "%s");
+ } else {
+ strncpy(iter->fmt, p + i, j + 1);
+ iter->fmt[j+1] = '\0';
+ }
+ if (star)
+ trace_seq_printf(&iter->seq, iter->fmt, len, str);
+ else
+ trace_seq_printf(&iter->seq, iter->fmt, str);
+
+ p += i + j + 1;
+ }
+ print:
+ if (*p)
+ trace_seq_vprintf(&iter->seq, p, ap);
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+ const char *p, *new_fmt;
+ char *q;
+
+ if (WARN_ON_ONCE(!fmt))
+ return fmt;
+
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ return fmt;
+
+ p = fmt;
+ new_fmt = q = iter->fmt;
+ while (*p) {
+ if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+ if (!trace_iter_expand_format(iter))
+ return fmt;
+
+ q += iter->fmt - new_fmt;
+ new_fmt = iter->fmt;
+ }
+
+ *q++ = *p++;
+
+ /* Replace %p with %px */
+ if (p[-1] == '%') {
+ if (p[0] == '%') {
+ *q++ = *p++;
+ } else if (p[0] == 'p' && !isalnum(p[1])) {
+ *q++ = *p++;
+ *q++ = 'x';
+ }
+ }
+ }
+ *q = '\0';
+
+ return new_fmt;
+}
+
#define STATIC_TEMP_BUF_SIZE 128
-static char static_temp_buf[STATIC_TEMP_BUF_SIZE];
+static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
/* Find the next real entry, without updating the iterator itself */
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
@@ -3507,13 +4069,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
if (iter->ent && iter->ent != iter->temp) {
if ((!iter->temp || iter->temp_size < iter->ent_size) &&
!WARN_ON_ONCE(iter->temp == static_temp_buf)) {
- kfree(iter->temp);
- iter->temp = kmalloc(iter->ent_size, GFP_KERNEL);
- if (!iter->temp)
+ void *temp;
+ temp = kmalloc(iter->ent_size, GFP_KERNEL);
+ if (!temp)
return NULL;
+ kfree(iter->temp);
+ iter->temp = temp;
+ iter->temp_size = iter->ent_size;
}
memcpy(iter->temp, iter->ent, iter->ent_size);
- iter->temp_size = iter->ent_size;
iter->ent = iter->temp;
}
entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts);
@@ -3610,15 +4174,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t l = 0;
int cpu;
- /*
- * copy the tracer to avoid using a global lock all around.
- * iter->trace is a copy of current_trace, the pointer to the
- * name may be used instead of a strcmp(), as iter->trace->name
- * will point to the same string as current_trace->name.
- */
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
- *iter->trace = *tr->current_trace;
+ if (unlikely(tr->current_trace != iter->trace)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
+ iter->trace = tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -3626,9 +4191,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EBUSY);
#endif
- if (!iter->snapshot)
- atomic_inc(&trace_record_taskinfo_disabled);
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -3671,9 +4233,6 @@ static void s_stop(struct seq_file *m, void *p)
return;
#endif
- if (!iter->snapshot)
- atomic_dec(&trace_record_taskinfo_disabled);
-
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -3743,14 +4302,15 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off/BH-disabled\n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / _-=> migrate-disable \n"
+ "# ||||| / delay \n"
+ "# cmd pid |||||| time | caller \n"
+ "# \\ / |||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -3771,26 +4331,27 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
print_event_info(buf, m);
- seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
- seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
+ seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : "");
+ seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char *space = " ";
- int prec = tgid ? 10 : 2;
+ static const char space[] = " ";
+ int prec = tgid ? 12 : 2;
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space);
+ seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
}
void
@@ -3802,9 +4363,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
struct tracer *type = iter->trace;
unsigned long entries;
unsigned long total;
- const char *name = "preemption";
-
- name = type->name;
+ const char *name = type->name;
get_total_entries(buf, &total, &entries);
@@ -3818,17 +4377,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
- "server",
-#elif defined(CONFIG_PREEMPT_VOLUNTARY)
- "desktop",
-#elif defined(CONFIG_PREEMPT)
- "preempt",
-#elif defined(CONFIG_PREEMPT_RT)
- "preempt_rt",
-#else
+ preempt_model_none() ? "server" :
+ preempt_model_voluntary() ? "desktop" :
+ preempt_model_full() ? "preempt" :
+ preempt_model_rt() ? "preempt_rt" :
"unknown",
-#endif
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -3908,8 +4461,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
- if (event)
+ if (event) {
+ if (tr->trace_flags & TRACE_ITER_FIELDS)
+ return print_event_fields(iter, event);
return event->funcs->trace(iter, sym_flags, event);
+ }
trace_seq_printf(s, "Unknown type %d\n", entry->type);
@@ -4202,7 +4758,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
} else {
- print_trace_line(iter);
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ }
ret = trace_print_seq(m, &iter->seq);
/*
* If we overflow the seq_file buffer, then it will
@@ -4235,6 +4795,25 @@ static const struct seq_operations tracer_seq_ops = {
.show = s_show,
};
+/*
+ * Note, as iter itself can be allocated and freed in different
+ * ways, this function is only used to free its content, and not
+ * the iterator itself. The only requirement to all the allocations
+ * is that it must zero all fields (kzalloc), as freeing works with
+ * ethier allocated content or NULL.
+ */
+static void free_trace_iter_content(struct trace_iterator *iter)
+{
+ /* The fmt is either NULL, allocated or points to static_fmt_buf */
+ if (iter->fmt != static_fmt_buf)
+ kfree(iter->fmt);
+
+ kfree(iter->temp);
+ kfree(iter->buffer_iter);
+ mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
+}
+
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
@@ -4267,15 +4846,17 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->temp_size = 128;
/*
- * We make a copy of the current tracer to avoid concurrent
- * changes on it while we are reading.
+ * trace_event_printf() may need to modify given format
+ * string to replace %p with %px so that it shows real address
+ * instead of hash value. However, that is only for the event
+ * tracing, other tracer may not need. Defer the allocation
+ * until it is needed.
*/
- mutex_lock(&trace_types_lock);
- iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
- if (!iter->trace)
- goto fail;
+ iter->fmt = NULL;
+ iter->fmt_size = 0;
- *iter->trace = *tr->current_trace;
+ mutex_lock(&trace_types_lock);
+ iter->trace = tr->current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
@@ -4340,9 +4921,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
fail:
mutex_unlock(&trace_types_lock);
- kfree(iter->trace);
- kfree(iter->temp);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
release:
seq_release_private(inode, file);
return ERR_PTR(-ENOMEM);
@@ -4383,6 +4962,60 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * The private pointer of the inode is the trace_event_file.
+ * Update the tr ref count associated to it.
+ */
+int tracing_open_file_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_event_file *file = inode->i_private;
+ int ret;
+
+ ret = tracing_check_open_get_tr(file->tr);
+ if (ret)
+ return ret;
+
+ mutex_lock(&event_mutex);
+
+ /* Fail if the file is marked for removal */
+ if (file->flags & EVENT_FILE_FL_FREED) {
+ trace_array_put(file->tr);
+ ret = -ENODEV;
+ } else {
+ event_file_get(file);
+ }
+
+ mutex_unlock(&event_mutex);
+ if (ret)
+ return ret;
+
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
+int tracing_release_file_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_event_file *file = inode->i_private;
+
+ trace_array_put(file->tr);
+ event_file_put(file);
+
+ return 0;
+}
+
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+ tracing_release_file_tr(inode, filp);
+ return single_release(inode, filp);
+}
+
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+ stream_open(inode, filp);
+ return tracing_open_generic_tr(inode, filp);
+}
+
static int tracing_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -4415,17 +5048,13 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
- mutex_destroy(&iter->mutex);
- free_cpumask_var(iter->started);
- kfree(iter->temp);
- kfree(iter->trace);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
seq_release_private(inode, file);
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -4612,6 +5241,8 @@ loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
+ .read_iter = seq_read_iter,
+ .splice_read = copy_splice_read,
.write = tracing_write_stub,
.llseek = tracing_lseek,
.release = tracing_release,
@@ -4671,11 +5302,17 @@ int tracing_set_cpumask(struct trace_array *tr,
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
}
arch_spin_unlock(&tr->max_lock);
@@ -4694,7 +5331,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -4799,6 +5436,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ int *map;
+
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -4821,10 +5460,19 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map)
- tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
- sizeof(*tgid_map),
- GFP_KERNEL);
+ if (!tgid_map) {
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ }
if (!tgid_map) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
@@ -4976,7 +5624,7 @@ static const char readme_msg[] =
" error_log\t- error log for failed commands (that support it)\n"
" buffer_size_kb\t- view and modify size of per cpu buffer\n"
" buffer_total_size_kb - view total size of all cpu buffers\n\n"
- " trace_clock\t\t-change the clock used to order events\n"
+ " trace_clock\t\t- change the clock used to order events\n"
" local: Per cpu clock but may not be synced across CPUs\n"
" global: Synced across CPUs but slows tracing down.\n"
" counter: Not a clock, but just an increment\n"
@@ -4985,7 +5633,7 @@ static const char readme_msg[] =
#ifdef CONFIG_X86_64
" x86-tsc: TSC cycle counter\n"
#endif
- "\n timestamp_mode\t-view the mode used to timestamp events\n"
+ "\n timestamp_mode\t- view the mode used to timestamp events\n"
" delta: Delta difference against a buffer-wide timestamp\n"
" absolute: Absolute (standalone) timestamp\n"
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
@@ -5073,37 +5721,52 @@ static const char readme_msg[] =
" uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
-#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \
+ defined(CONFIG_FPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
- "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
- "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+ "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n"
+ "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n"
+#endif
+#ifdef CONFIG_FPROBE_EVENTS
+ "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n"
+ "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n"
+#endif
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
- "\t -:[<group>/]<event>\n"
+ "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"
+ "\t -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
- "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+ "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
- " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n"
+ " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
- "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+ "\t <argname>[->field[->field|.field...]],\n"
+#else
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
- "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t <type>\\[<array-size>\\]\n"
+ "\t symstr, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
"\t [unsigned] char/int/long\n"
#endif
+ "\t efield: For event probes ('e' types), the field is on of the fields\n"
+ "\t of the <attached-group>/<attached-event>.\n"
#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
@@ -5151,18 +5814,34 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
" hist trigger\t- If set, event hits are aggregated into a hash table\n"
"\t Format: hist:keys=<field1[,field2,...]>\n"
+ "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n"
"\t [:values=<field1[,field2,...]>]\n"
"\t [:sort=<field1[,field2,...]>]\n"
"\t [:size=#entries]\n"
"\t [:pause][:continue][:clear]\n"
"\t [:name=histname1]\n"
+ "\t [:nohitcount]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
+ "\t Note, special fields can be used as well:\n"
+ "\t common_timestamp - to record current timestamp\n"
+ "\t common_cpu - to record the CPU the event happened on\n"
+ "\n"
+ "\t A hist trigger variable can be:\n"
+ "\t - a reference to a field e.g. x=current_timestamp,\n"
+ "\t - a reference to another variable e.g. y=$x,\n"
+ "\t - a numeric literal: e.g. ms_per_sec=1000,\n"
+ "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
+ "\n"
+ "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n"
+ "\t multiplication(*) and division(/) operators. An operand can be either a\n"
+ "\t variable reference, field or numeric literal.\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
"\t correspond to fields in the event's format description. Keys\n"
- "\t can be any field, or the special string 'stacktrace'.\n"
+ "\t can be any field, or the special string 'common_stacktrace'.\n"
"\t Compound keys consisting of up to two fields can be specified\n"
"\t by the 'keys' keyword. Values must correspond to numeric\n"
"\t fields. Sort keys consisting of up to two fields can be\n"
@@ -5187,7 +5866,10 @@ static const char readme_msg[] =
"\t .execname display a common_pid as a program name\n"
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
- "\t .usecs display a common_timestamp in microseconds\n\n"
+ "\t .buckets=size display values in groups of size rather than raw number\n"
+ "\t .usecs display a common_timestamp in microseconds\n"
+ "\t .percent display a number of percentage value\n"
+ "\t .graph display a bar-graph of a value\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
"\t until told to do so. 'continue' can be used to start or\n"
@@ -5195,6 +5877,8 @@ static const char readme_msg[] =
"\t The 'clear' parameter will clear the contents of a running\n"
"\t hist trigger and leave its current paused/active state\n"
"\t unchanged.\n\n"
+ "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n"
+ "\t raw hitcount in the histogram.\n\n"
"\t The enable_hist and disable_hist triggers can be used to\n"
"\t have one event conditionally start and stop another event's\n"
"\t already-attached hist trigger. The syntax is analogous to\n"
@@ -5210,7 +5894,12 @@ static const char readme_msg[] =
"\t trace(<synthetic_event>,param list) - generate synthetic event\n"
"\t save(field,...) - save current event fields\n"
#ifdef CONFIG_TRACER_SNAPSHOT
- "\t snapshot() - snapshot the trace buffer\n"
+ "\t snapshot() - snapshot the trace buffer\n\n"
+#endif
+#ifdef CONFIG_SYNTH_EVENTS
+ " events/synthetic_events\t- Create/append/remove/show synthetic events\n"
+ "\t Write into this file to define/undefine new synthetic events.\n"
+ "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
#endif
#endif
;
@@ -5231,37 +5920,16 @@ static const struct file_operations tracing_readme_fops = {
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
- int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
- if (trace_find_tgid(*ptr))
- return ptr;
- }
+ int pid = ++(*pos);
- return NULL;
+ return trace_find_tgid_ptr(pid);
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- void *v;
- loff_t l = 0;
+ int pid = *pos;
- if (!tgid_map)
- return NULL;
-
- v = &tgid_map[0];
- while (l <= *pos) {
- v = saved_tgids_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
+ return trace_find_tgid_ptr(pid);
}
static void saved_tgids_stop(struct seq_file *m, void *v)
@@ -5270,9 +5938,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v)
static int saved_tgids_show(struct seq_file *m, void *v)
{
- int pid = (int *)v - tgid_map;
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
- seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ if (tgid == 0)
+ return SEQ_SKIP;
+
+ seq_printf(m, "%d %d\n", pid, tgid);
return 0;
}
@@ -5388,37 +6061,29 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
char buf[64];
int r;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- kfree(s->saved_cmdlines);
- kfree(s->map_cmdline_to_pid);
- kfree(s);
-}
-
static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = allocate_cmdlines_buffer(val);
if (!s)
return -ENOMEM;
- if (allocate_cmdlines_buffer(val, s) < 0) {
- kfree(s);
- return -ENOMEM;
- }
-
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -5606,7 +6271,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("eval_map", 0444, d_tracer,
+ trace_create_file("eval_map", TRACE_MODE_READ, d_tracer,
NULL, &tracing_eval_map_fops);
}
@@ -5660,6 +6325,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
per_cpu_ptr(buf->data, cpu)->entries = val;
}
+static void update_buffer_entries(struct array_buffer *buf, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ } else {
+ per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
+ }
+}
+
#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
@@ -5698,19 +6372,21 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
* we use the size that was given, and we can forget about
* expanding it later.
*/
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(tr);
/* May be called before buffers are initialized */
if (!tr->array_buffer.buffer)
return 0;
+ /* Do not allow tracing while resizing ring buffer */
+ tracing_stop_tr(tr);
+
ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
if (ret < 0)
- return ret;
+ goto out_start;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
- !tr->current_trace->use_max_tr)
+ if (!tr->allocated_snapshot)
goto out;
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
@@ -5735,29 +6411,24 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
WARN_ON(1);
tracing_disabled = 1;
}
- return ret;
+ goto out_start;
}
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->max_buffer, size);
- else
- per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+ update_buffer_entries(&tr->max_buffer, cpu);
out:
#endif /* CONFIG_TRACER_MAX_TRACE */
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->array_buffer, size);
- else
- per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
-
+ update_buffer_entries(&tr->array_buffer, cpu);
+ out_start:
+ tracing_start_tr(tr);
return ret;
}
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret = size;
+ int ret;
mutex_lock(&trace_types_lock);
@@ -5782,6 +6453,7 @@ out:
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
+ * @tr: The tracing instance
*
* To save on memory when the tracing is never used on a system with it
* configured in. The ring buffers are set to a minimum size. But once
@@ -5790,13 +6462,13 @@ out:
*
* This function is to be called when a tracer is about to be used.
*/
-int tracing_update_buffers(void)
+int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+ if (!tr->ring_buffer_expanded)
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
@@ -5825,12 +6497,18 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
+static bool tracer_options_updated;
+
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
+ /* Only create trace option files after update_tracer_options finish */
+ if (!tracer_options_updated)
+ return;
+
create_trace_option_files(tr, t);
}
@@ -5844,7 +6522,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded) {
+ if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
@@ -5865,10 +6543,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
}
@@ -5887,7 +6567,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->current_trace->ref) {
+ if (tr->trace_ref) {
ret = -EBUSY;
goto out;
}
@@ -5899,12 +6579,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->current_trace->use_max_tr;
+
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
-#ifdef CONFIG_TRACER_MAX_TRACE
- had_max_tr = tr->allocated_snapshot;
-
if (had_max_tr && !t->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
@@ -5916,14 +6596,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
synchronize_rcu();
free_snapshot(tr);
}
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (t->use_max_tr && !had_max_tr) {
+ if (t->use_max_tr && !tr->allocated_snapshot) {
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
+#else
+ tr->current_trace = &nop_trace;
#endif
if (t->init) {
@@ -5947,7 +6627,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
- int i;
+ char *name;
size_t ret;
int err;
@@ -5961,11 +6641,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
- /* strip ending whitespace. */
- for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
- buf[i] = 0;
+ name = strim(buf);
- err = tracing_set_tracer(tr, buf);
+ err = tracing_set_tracer(tr, name);
if (err)
return err;
@@ -6036,28 +6714,58 @@ out:
return ret;
}
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
}
static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
}
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6065,13 +6773,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6094,7 +6805,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6103,13 +6814,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
- tr->current_trace->ref++;
-out:
+ tr->trace_ref++;
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6122,15 +6836,14 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- tr->current_trace->ref--;
+ tr->trace_ref--;
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
- free_cpumask_var(iter->started);
- mutex_destroy(&iter->mutex);
+ free_trace_iter_content(iter);
kfree(iter);
trace_array_put(tr);
@@ -6154,7 +6867,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
@@ -6243,16 +6956,13 @@ waitagain:
goto out;
}
- if (cnt >= PAGE_SIZE)
- cnt = PAGE_SIZE - 1;
+ if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+ cnt = TRACE_SEQ_BUFFER_SIZE - 1;
/* reset all but tr, trace, and overruns */
- memset(&iter->seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ trace_iterator_reset(iter);
cpumask_clear(iter->started);
trace_seq_init(&iter->seq);
- iter->pos = -1;
trace_event_read_lock();
trace_access_lock(iter->cpu_file);
@@ -6262,7 +6972,20 @@ waitagain:
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- /* don't print partial lines */
+ /*
+ * If one print_trace_line() fills entire trace_seq in one shot,
+ * trace_seq_to_user() will returns -EBUSY because save_len == 0,
+ * In this case, we need to consume it, otherwise, loop will peek
+ * this event next time, resulting in an infinite loop.
+ */
+ if (save_len == 0) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ trace_consume(iter);
+ break;
+ }
+
+ /* In other cases, don't print partial lines */
iter->seq.seq.len = save_len;
break;
}
@@ -6285,7 +7008,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+ if (iter->seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -6471,7 +7194,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
}
if (buf_size_same) {
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
r = sprintf(buf, "%lu (expanded: %lu)\n",
size >> 10,
trace_buf_size >> 10);
@@ -6528,10 +7251,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
r = sprintf(buf, "%lu\n", size);
else
r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
@@ -6570,6 +7293,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
+#define TRACE_MARKER_MAX_SIZE 4096
+
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -6579,9 +7304,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
+ int meta_size;
ssize_t written;
- int size;
+ size_t size;
int len;
/* Used in tracing_mark_raw_write() as well */
@@ -6594,13 +7319,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
+ if ((ssize_t)cnt < 0)
+ return -EINVAL;
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
- local_save_flags(irq_flags);
- size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+ size = cnt + meta_size;
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
@@ -6608,10 +7335,26 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
- if (unlikely(!event))
+ tracing_gen_ctx());
+ if (unlikely(!event)) {
+ /*
+ * If the size was greater than what was allowed, then
+ * make it smaller and try again.
+ */
+ if (size > ring_buffer_max_event_size(buffer)) {
+ /* cnt < FAULTED size should never be bigger than max */
+ if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+ return -EBADF;
+ cnt = ring_buffer_max_event_size(buffer) - meta_size;
+ /* The above should only happen once */
+ if (WARN_ON_ONCE(cnt + meta_size == size))
+ return -EBADF;
+ goto again;
+ }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+ }
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
@@ -6623,12 +7366,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
written = -EFAULT;
} else
written = cnt;
- len = cnt;
if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
/* do not add \n before testing triggers, but add \0 */
entry->buf[cnt] = '\0';
- tt = event_triggers_call(tr->trace_marker_file, entry, event);
+ tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
}
if (entry->buf[cnt - 1] != '\n') {
@@ -6637,20 +7379,16 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
} else
entry->buf[cnt] = '\0';
+ if (static_branch_unlikely(&trace_marker_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_MARKER);
__buffer_unlock_commit(buffer, event);
if (tt)
event_triggers_post_call(tr->trace_marker_file, tt);
- if (written > 0)
- *fpos += written;
-
return written;
}
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -6659,7 +7397,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct raw_data_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6673,22 +7410,20 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EINVAL;
/* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ if (cnt < sizeof(unsigned int))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
+
+ if (size > ring_buffer_max_event_size(buffer))
+ return -EINVAL;
+
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (!event)
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -6705,9 +7440,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
- if (written > 0)
- *fpos += written;
-
return written;
}
@@ -6836,31 +7568,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
return ret;
}
-int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
+{
+ if (rbe == this_cpu_read(trace_buffered_event))
+ return ring_buffer_time_stamp(buffer);
+
+ return ring_buffer_event_time_stamp(buffer, rbe);
+}
+
+/*
+ * Set or disable using the per CPU trace_buffer_event when possible.
+ */
+int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (abs && tr->time_stamp_abs_ref++)
+ if (set && tr->no_filter_buffering_ref++)
goto out;
- if (!abs) {
- if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ if (!set) {
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
ret = -EINVAL;
goto out;
}
- if (--tr->time_stamp_abs_ref)
- goto out;
+ --tr->no_filter_buffering_ref;
}
-
- ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer)
- ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
-#endif
out:
mutex_unlock(&trace_types_lock);
@@ -6871,6 +7606,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -6916,6 +7652,11 @@ out:
return ret;
}
+static void tracing_swap_cpu_buffer(void *tr)
+{
+ update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -6926,7 +7667,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
unsigned long val;
int ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -6941,10 +7682,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
@@ -6972,13 +7715,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
- local_irq_disable();
/* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ local_irq_disable();
update_max_tr(tr, current, smp_processor_id(), NULL);
- else
- update_max_tr_single(tr, current, iter->cpu_file);
- local_irq_enable();
+ local_irq_enable();
+ } else {
+ smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+ (void *)tr, 1);
+ }
break;
default:
if (tr->allocated_snapshot) {
@@ -7057,20 +7802,22 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
#endif
static const struct file_operations set_tracer_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_pipe_fops = {
@@ -7104,16 +7851,14 @@ static const struct file_operations tracing_free_buffer_fops = {
};
static const struct file_operations tracing_mark_fops = {
- .open = tracing_open_generic_tr,
+ .open = tracing_mark_open,
.write = tracing_mark_write,
- .llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
static const struct file_operations tracing_mark_raw_fops = {
- .open = tracing_open_generic_tr,
+ .open = tracing_mark_open,
.write = tracing_mark_raw_write,
- .llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
@@ -7151,6 +7896,91 @@ static const struct file_operations snapshot_raw_fops = {
#endif /* CONFIG_TRACER_SNAPSHOT */
+/*
+ * trace_min_max_write - Write a u64 value to a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the write interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param structure that
+ * defines where to write the value, the min and the max acceptable values,
+ * and a lock to protect the write.
+ */
+static ssize_t
+trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ u64 val;
+ int err;
+
+ if (!param)
+ return -EFAULT;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ if (param->lock)
+ mutex_lock(param->lock);
+
+ if (param->min && val < *param->min)
+ err = -EINVAL;
+
+ if (param->max && val > *param->max)
+ err = -EINVAL;
+
+ if (!err)
+ *param->val = val;
+
+ if (param->lock)
+ mutex_unlock(param->lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+/*
+ * trace_min_max_read - Read a u64 value from a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the read interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param struct with valid
+ * data.
+ */
+static ssize_t
+trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ char buf[U64_STR_SIZE];
+ int len;
+ u64 val;
+
+ if (!param)
+ return -EFAULT;
+
+ val = *param->val;
+
+ if (cnt > sizeof(buf))
+ cnt = sizeof(buf);
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+const struct file_operations trace_min_max_fops = {
+ .open = tracing_open_generic,
+ .read = trace_min_max_read,
+ .write = trace_min_max_write,
+};
+
#define TRACING_LOG_ERRS_MAX 8
#define TRACING_LOG_LOC_MAX 128
@@ -7159,7 +7989,7 @@ static const struct file_operations snapshot_raw_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7167,25 +7997,53 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
{
struct tracing_log_err *err;
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
+{
+ struct tracing_log_err *err;
+ char *cmd;
+
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- tr->n_err_log_entries++;
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
+ tr->n_err_log_entries++;
return err;
}
-
+ cmd = kzalloc(len, GFP_KERNEL);
+ if (!cmd)
+ return ERR_PTR(-ENOMEM);
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = cmd;
list_del(&err->list);
return err;
@@ -7196,11 +8054,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
* @cmd: The tracing command that caused the error
* @str: The string to position the caret at within @cmd
*
- * Finds the position of the first occurence of @str within @cmd. The
+ * Finds the position of the first occurrence of @str within @cmd. The
* return value can be passed to tracing_log_err() for caret placement
* within @cmd.
*
- * Returns the index within @cmd of the first occurence of @str or 0
+ * Returns the index within @cmd of the first occurrence of @str or 0
* if @str was not found.
*/
unsigned int err_pos(char *cmd, const char *str)
@@ -7246,22 +8104,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7279,7 +8140,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7307,9 +8168,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -7392,7 +8253,7 @@ static const struct file_operations tracing_err_log_fops = {
.open = tracing_err_log_open,
.write = tracing_err_log_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = tracing_lseek,
.release = tracing_err_log_release,
};
@@ -7406,7 +8267,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kvzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7424,7 +8285,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
- tr->current_trace->ref++;
+ tr->trace_ref++;
mutex_unlock(&trace_types_lock);
@@ -7450,6 +8311,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -7461,6 +8324,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -7469,19 +8343,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -7502,11 +8377,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -7518,6 +8393,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return size;
}
+static int tracing_buffers_flush(struct file *file, fl_owner_t id)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ return 0;
+}
+
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
@@ -7525,14 +8414,14 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- iter->tr->current_trace->ref--;
+ iter->tr->trace_ref--;
__trace_array_put(iter->tr);
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
- kfree(info);
+ kvfree(info);
mutex_unlock(&trace_types_lock);
@@ -7611,6 +8500,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -7619,13 +8509,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -7635,7 +8526,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -7656,7 +8547,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -7665,14 +8556,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
@@ -7682,6 +8573,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ long wait_index;
+
if (ret)
goto out;
@@ -7689,10 +8582,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- ret = wait_on_pipe(iter, iter->tr->buffer_percent);
+ wait_index = READ_ONCE(iter->wait_index);
+
+ ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
+ /* No need to wait after waking up when tracing is off */
+ if (!tracer_tracing_is_on(iter->tr))
+ goto out;
+
+ /* Make sure we see the new wait_index */
+ smp_rmb();
+ if (wait_index != iter->wait_index)
+ goto out;
+
goto again;
}
@@ -7703,12 +8607,35 @@ out:
return ret;
}
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ if (cmd)
+ return -ENOIOCTLCMD;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
+ .flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
+ .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
@@ -7750,7 +8677,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
t, usec_rem);
- t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer));
usec_rem = do_div(t, USEC_PER_SEC);
trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
} else {
@@ -7759,7 +8686,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
trace_seq_printf(s, "now ts: %llu\n",
- ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ ring_buffer_time_stamp(trace_buf->buffer));
}
cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
@@ -8034,27 +8961,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
}
/* per cpu trace_pipe */
- trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_pipe_fops);
/* per cpu trace */
- trace_create_cpu_file("trace", 0644, d_cpu,
+ trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &tracing_fops);
- trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_buffers_fops);
- trace_create_cpu_file("stats", 0444, d_cpu,
+ trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", 0644, d_cpu,
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &snapshot_raw_fops);
#endif
}
@@ -8108,12 +9035,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
+static int tracing_open_options(struct inode *inode, struct file *filp)
+{
+ struct trace_option_dentry *topt = inode->i_private;
+ int ret;
+
+ ret = tracing_check_open_get_tr(topt->tr);
+ if (ret)
+ return ret;
+
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+static int tracing_release_options(struct inode *inode, struct file *file)
+{
+ struct trace_option_dentry *topt = file->private_data;
+
+ trace_array_put(topt->tr);
+ return 0;
+}
static const struct file_operations trace_options_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_options,
.read = trace_options_read,
.write = trace_options_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_options,
};
/*
@@ -8260,8 +9208,8 @@ create_trace_option_file(struct trace_array *tr,
topt->opt = opt;
topt->tr = tr;
- topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
- &trace_options_fops);
+ topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
+ t_options, topt, &trace_options_fops);
}
@@ -8336,7 +9284,7 @@ create_trace_option_core_file(struct trace_array *tr,
if (!t_options)
return NULL;
- return trace_create_file(option, 0644, t_options,
+ return trace_create_file(option, TRACE_MODE_WRITE, t_options,
(void *)&tr->trace_flags_index[index],
&trace_options_core_fops);
}
@@ -8397,6 +9345,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
+ /* Wake up any waiters */
+ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
mutex_unlock(&trace_types_lock);
}
@@ -8443,9 +9393,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
if (val > 100)
return -EINVAL;
- if (!val)
- val = 1;
-
tr->buffer_percent = val;
(*ppos)++;
@@ -8461,6 +9408,103 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+static ssize_t
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ size_t size;
+ char buf[64];
+ int order;
+ int r;
+
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ size = (PAGE_SIZE << order) / 1024;
+
+ r = sprintf(buf, "%zd\n", size);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ unsigned long val;
+ int old_order;
+ int order;
+ int pages;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ val *= 1024; /* value passed in is in KB */
+
+ pages = DIV_ROUND_UP(val, PAGE_SIZE);
+ order = fls(pages - 1);
+
+ /* limit between 1 and 128 system pages */
+ if (order < 0 || order > 7)
+ return -EINVAL;
+
+ /* Do not allow tracing while changing the order of the ring buffer */
+ tracing_stop_tr(tr);
+
+ old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ if (old_order == order)
+ goto out;
+
+ ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+ if (!tr->allocated_snapshot)
+ goto out_max;
+
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret) {
+ /* Put back the old order */
+ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+ if (WARN_ON_ONCE(cnt)) {
+ /*
+ * AARGH! We are left with different orders!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snap shot. We succeeded to
+ * update the order of the main buffer, but failed to
+ * update the order of the max buffer. But when we tried
+ * to reset the main buffer to the original size, we
+ * failed there too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ tracing_disabled = 1;
+ }
+ goto out;
+ }
+ out_max:
+#endif
+ (*ppos)++;
+ out:
+ if (ret)
+ cnt = ret;
+ tracing_start_tr(tr);
+ return cnt;
+}
+
+static const struct file_operations buffer_subbuf_size_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_subbuf_size_read,
+ .write = buffer_subbuf_size_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -8493,6 +9537,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
return 0;
}
+static void free_trace_buffer(struct array_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
+
static int allocate_trace_buffers(struct trace_array *tr, int size)
{
int ret;
@@ -8505,34 +9559,17 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
- ring_buffer_free(tr->array_buffer.buffer);
- tr->array_buffer.buffer = NULL;
- free_percpu(tr->array_buffer.data);
- tr->array_buffer.data = NULL;
+ free_trace_buffer(&tr->array_buffer);
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
- /*
- * Only the top level trace array gets its snapshot allocated
- * from the kernel command line.
- */
allocate_snapshot = false;
#endif
return 0;
}
-static void free_trace_buffer(struct array_buffer *buf)
-{
- if (buf->buffer) {
- ring_buffer_free(buf->buffer);
- buf->buffer = NULL;
- free_percpu(buf->data);
- buf->data = NULL;
- }
-}
-
static void free_trace_buffers(struct trace_array *tr)
{
if (!tr)
@@ -8565,6 +9602,7 @@ static void __update_tracer_options(struct trace_array *tr)
static void update_tracer_options(struct trace_array *tr)
{
mutex_lock(&trace_types_lock);
+ tracer_options_updated = true;
__update_tracer_options(tr);
mutex_unlock(&trace_types_lock);
}
@@ -8597,7 +9635,28 @@ struct trace_array *trace_array_find_get(const char *instance)
return tr;
}
-static struct trace_array *trace_array_create(const char *name)
+static int trace_array_create_dir(struct trace_array *tr)
+{
+ int ret;
+
+ tr->dir = tracefs_create_dir(tr->name, trace_instance_dir);
+ if (!tr->dir)
+ return -EINVAL;
+
+ ret = event_trace_add_tracer(tr->dir, tr);
+ if (ret) {
+ tracefs_remove(tr->dir);
+ return ret;
+ }
+
+ init_tracer_tracefs(tr, tr->dir);
+ __update_tracer_options(tr);
+
+ return ret;
+}
+
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -8614,6 +9673,15 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -8632,38 +9700,46 @@ static struct trace_array *trace_array_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = tracefs_create_dir(name, trace_instance_dir);
- if (!tr->dir)
- goto out_free_tr;
+ /* The ring buffer is defaultly expanded */
+ trace_set_ring_buffer_expanded(tr);
- ret = event_trace_add_tracer(tr->dir, tr);
- if (ret) {
- tracefs_remove(tr->dir);
+ if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
- }
ftrace_init_trace_array(tr);
- init_tracer_tracefs(tr, tr->dir);
init_trace_flags_index(tr);
- __update_tracer_options(tr);
+
+ if (trace_instance_dir) {
+ ret = trace_array_create_dir(tr);
+ if (ret)
+ goto out_free_tr;
+ } else
+ __trace_early_add_events(tr);
list_add(&tr->list, &ftrace_trace_arrays);
tr->ref++;
-
return tr;
out_free_tr:
+ ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -8689,6 +9765,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -8702,7 +9779,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -8714,7 +9791,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -8733,7 +9810,7 @@ static int __remove_instance(struct trace_array *tr)
int i;
/* Reference counter for a newly created trace array = 1. */
- if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
return -EBUSY;
list_del(&tr->list);
@@ -8750,17 +9827,20 @@ static int __remove_instance(struct trace_array *tr)
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove(tr->dir);
+ free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
+ clear_tracing_err_log(tr);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
- tr = NULL;
return 0;
}
@@ -8814,41 +9894,56 @@ static int instance_rmdir(const char *name)
static __init void create_trace_instances(struct dentry *d_tracer)
{
+ struct trace_array *tr;
+
trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
instance_mkdir,
instance_rmdir);
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->name)
+ continue;
+ if (MEM_FAIL(trace_array_create_dir(tr) < 0,
+ "Failed to create instance directory\n"))
+ break;
+ }
+
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
}
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct trace_event_file *file;
int cpu;
- trace_create_file("available_tracers", 0444, d_tracer,
+ trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
tr, &show_traces_fops);
- trace_create_file("current_tracer", 0644, d_tracer,
+ trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", 0644, d_tracer,
+ trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_cpumask_fops);
- trace_create_file("trace_options", 0644, d_tracer,
+ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
- trace_create_file("trace", 0644, d_tracer,
+ trace_create_file("trace", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_fops);
- trace_create_file("trace_pipe", 0444, d_tracer,
+ trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", 0644, d_tracer,
+ trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_entries_fops);
- trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
trace_create_file("free_buffer", 0200, d_tracer,
@@ -8857,32 +9952,31 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
- file = __find_event_file(tr, "ftrace", "print");
- if (file && file->dir)
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
- tr->trace_marker_file = file;
+ tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", 0644, d_tracer, tr,
+ trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
&trace_clock_fops);
- trace_create_file("tracing_on", 0644, d_tracer,
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
tr, &rb_simple_fops);
- trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
&trace_time_stamp_mode_fops);
tr->buffer_percent = 50;
- trace_create_file("buffer_percent", 0444, d_tracer,
+ trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
create_trace_options_dir(tr);
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_maxlat_file(tr, d_tracer);
#endif
@@ -8890,11 +9984,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", 0644, d_tracer,
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- trace_create_file("error_log", 0644, d_tracer,
+ trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
for_each_tracing_cpu(cpu)
@@ -8932,40 +10026,42 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
* directory. It is called via fs_initcall() by any of the boot up code
* and expects to return the dentry of the top level tracing directory.
*/
-struct dentry *tracing_init_dentry(void)
+int tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Tracing disabled due to lockdown\n");
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
/* The top level trace array uses NULL as parent */
if (tr->dir)
- return NULL;
+ return 0;
- if (WARN_ON(!tracefs_initialized()) ||
- (IS_ENABLED(CONFIG_DEBUG_FS) &&
- WARN_ON(!debugfs_initialized())))
- return ERR_PTR(-ENODEV);
+ if (WARN_ON(!tracefs_initialized()))
+ return -ENODEV;
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
* the tracefs file system there, so older tools still
- * work with the newer kerenl.
+ * work with the newer kernel.
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
- return NULL;
+ return 0;
}
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_eval_init(void)
+static struct workqueue_struct *eval_map_wq __initdata;
+static struct work_struct eval_map_work __initdata;
+static struct work_struct tracerfs_init_work __initdata;
+
+static void __init eval_map_work_func(struct work_struct *work)
{
int len;
@@ -8973,6 +10069,35 @@ static void __init trace_eval_init(void)
trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
+static int __init trace_eval_init(void)
+{
+ INIT_WORK(&eval_map_work, eval_map_work_func);
+
+ eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
+ if (!eval_map_wq) {
+ pr_err("Unable to allocate eval_map_wq\n");
+ /* Do work here */
+ eval_map_work_func(&eval_map_work);
+ return -ENOMEM;
+ }
+
+ queue_work(eval_map_wq, &eval_map_work);
+ return 0;
+}
+
+subsys_initcall(trace_eval_init);
+
+static int __init trace_eval_sync(void)
+{
+ /* Make sure the eval map updates are finished */
+ if (eval_map_wq)
+ destroy_workqueue(eval_map_wq);
+ return 0;
+}
+
+late_initcall_sync(trace_eval_sync);
+
+
#ifdef CONFIG_MODULES
static void trace_module_add_evals(struct module *mod)
{
@@ -9035,7 +10160,7 @@ static int trace_module_notify(struct notifier_block *self,
break;
}
- return 0;
+ return NOTIFY_OK;
}
static struct notifier_block trace_module_nb = {
@@ -9044,91 +10169,104 @@ static struct notifier_block trace_module_nb = {
};
#endif /* CONFIG_MODULES */
-static __init int tracer_init_tracefs(void)
+static __init void tracer_init_tracefs_work_func(struct work_struct *work)
{
- struct dentry *d_tracer;
-
- trace_access_lock_init();
-
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
event_trace_init();
- init_tracer_tracefs(&global_trace, d_tracer);
- ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, NULL);
+ ftrace_init_tracefs_toplevel(&global_trace, NULL);
- trace_create_file("tracing_thresh", 0644, d_tracer,
+ trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL,
&global_trace, &tracing_thresh_fops);
- trace_create_file("README", 0444, d_tracer,
+ trace_create_file("README", TRACE_MODE_READ, NULL,
NULL, &tracing_readme_fops);
- trace_create_file("saved_cmdlines", 0444, d_tracer,
+ trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("saved_cmdlines_size", 0644, d_tracer,
+ trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_create_file("saved_tgids", 0444, d_tracer,
+ trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_tgids_fops);
- trace_eval_init();
-
- trace_create_eval_file(d_tracer);
+ trace_create_eval_file(NULL);
#ifdef CONFIG_MODULES
register_module_notifier(&trace_module_nb);
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL,
NULL, &tracing_dyn_info_fops);
#endif
- create_trace_instances(d_tracer);
+ create_trace_instances(NULL);
update_tracer_options(&global_trace);
-
- return 0;
}
-static int trace_panic_handler(struct notifier_block *this,
- unsigned long event, void *unused)
+static __init int tracer_init_tracefs(void)
{
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- return NOTIFY_OK;
-}
+ int ret;
-static struct notifier_block trace_panic_notifier = {
- .notifier_call = trace_panic_handler,
- .next = NULL,
- .priority = 150 /* priority: INT_MAX >= x >= 0 */
-};
+ trace_access_lock_init();
-static int trace_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
-{
- switch (val) {
- case DIE_OOPS:
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- break;
- default:
- break;
+ ret = tracing_init_dentry();
+ if (ret)
+ return 0;
+
+ if (eval_map_wq) {
+ INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
+ queue_work(eval_map_wq, &tracerfs_init_work);
+ } else {
+ tracer_init_tracefs_work_func(NULL);
}
- return NOTIFY_OK;
+
+ rv_init_interface();
+
+ return 0;
}
+fs_initcall(tracer_init_tracefs);
+
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused);
+
+static struct notifier_block trace_panic_notifier = {
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
+};
+
static struct notifier_block trace_die_notifier = {
- .notifier_call = trace_die_handler,
- .priority = 200
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
};
/*
+ * The idea is to execute the following die/panic callback early, in order
+ * to avoid showing irrelevant information in the trace (like other panic
+ * notifier functions); we are the 2nd to run, after hung_task/rcu_stall
+ * warnings get disabled (to prevent potential log flooding).
+ */
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused)
+{
+ if (!ftrace_dump_on_oops)
+ return NOTIFY_DONE;
+
+ /* The die notifier requires DIE_OOPS to trigger */
+ if (self == &trace_die_notifier && ev != DIE_OOPS)
+ return NOTIFY_DONE;
+
+ ftrace_dump(ftrace_dump_on_oops);
+
+ return NOTIFY_DONE;
+}
+
+/*
* printk is set to max of 1024, we really don't need it that big.
* Nothing should be printing 1000 characters anyway.
*/
@@ -9181,6 +10319,12 @@ void trace_init_global_iter(struct trace_iterator *iter)
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
if (trace_clocks[iter->tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ /* Can not use kmalloc for iter.temp and iter.fmt */
+ iter->temp = static_temp_buf;
+ iter->temp_size = STATIC_TEMP_BUF_SIZE;
+ iter->fmt = static_fmt_buf;
+ iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
@@ -9210,13 +10354,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
- /* Can not use kmalloc for iter.temp */
- iter.temp = static_temp_buf;
- iter.temp_size = STATIC_TEMP_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -9250,7 +10390,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
/*
- * We need to stop all tracing on all CPUS to read the
+ * We need to stop all tracing on all CPUS to read
* the next buffer. This is a bit expensive, but is
* not done often. We fill all what we can read,
* and then release the locks again.
@@ -9290,35 +10430,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
-int trace_run_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
#define WRITE_BUFSIZE 4096
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
+ int (*createfn)(const char *))
{
char *kbuf, *buf, *tmp;
int ret = 0;
@@ -9366,7 +10486,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (tmp)
*tmp = '\0';
- ret = trace_run_command(buf, createfn);
+ ret = createfn(buf);
if (ret)
goto out;
buf += size;
@@ -9381,6 +10501,79 @@ out:
return ret;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+ char *test;
+ int len = strlen(name);
+ bool ret;
+
+ if (!boot_snapshot_index)
+ return false;
+
+ if (strncmp(name, boot_snapshot_info, len) == 0 &&
+ boot_snapshot_info[len] == '\t')
+ return true;
+
+ test = kmalloc(strlen(name) + 3, GFP_KERNEL);
+ if (!test)
+ return false;
+
+ sprintf(test, "\t%s\t", name);
+ ret = strstr(boot_snapshot_info, test) == NULL;
+ kfree(test);
+ return ret;
+}
+
+__init static void do_allocate_snapshot(const char *name)
+{
+ if (!tr_needs_alloc_snapshot(name))
+ return;
+
+ /*
+ * When allocate_snapshot is set, the next call to
+ * allocate_trace_buffers() (called by trace_array_get_by_name())
+ * will allocate the snapshot buffer. That will alse clear
+ * this flag.
+ */
+ allocate_snapshot = true;
+}
+#else
+static inline void do_allocate_snapshot(const char *name) { }
+#endif
+
+__init static void enable_instances(void)
+{
+ struct trace_array *tr;
+ char *curr_str;
+ char *str;
+ char *tok;
+
+ /* A tab is always appended */
+ boot_instance_info[boot_instance_index - 1] = '\0';
+ str = boot_instance_info;
+
+ while ((curr_str = strsep(&str, "\t"))) {
+
+ tok = strsep(&curr_str, ",");
+
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(tok);
+
+ tr = trace_array_get_by_name(tok, NULL);
+ if (!tr) {
+ pr_warn("Failed to create instance buffer %s\n", curr_str);
+ continue;
+ }
+ /* Allow user space to delete it */
+ trace_array_put(tr);
+
+ while ((tok = strsep(&curr_str, ","))) {
+ early_enable_events(tr, tok, true);
+ }
+ }
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
@@ -9393,7 +10586,7 @@ __init static int tracer_alloc_buffers(void)
}
/*
- * Make sure we don't accidently add more trace options
+ * Make sure we don't accidentally add more trace options
* than we have bits for.
*/
BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
@@ -9410,7 +10603,7 @@ __init static int tracer_alloc_buffers(void)
trace_printk_init_buffers();
/* To save memory, keep the ring buffer size to its minimum */
- if (ring_buffer_expanded)
+ if (global_trace.ring_buffer_expanded)
ring_buf_size = trace_buf_size;
else
ring_buf_size = 1;
@@ -9422,12 +10615,12 @@ __init static int tracer_alloc_buffers(void)
/*
* The prepare callbacks allocates some memory for the ring buffer. We
- * don't free the buffer if the if the CPU goes down. If we were to free
+ * don't free the buffer if the CPU goes down. If we were to free
* the buffer, then the user would lose any trace that was in the
* buffer. The memory will be removed once the "instance" is removed.
*/
ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
- "trace/RB:preapre", trace_rb_cpu_prepare,
+ "trace/RB:prepare", trace_rb_cpu_prepare,
NULL);
if (ret < 0)
goto out_free_cpumask;
@@ -9440,12 +10633,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -9494,8 +10689,12 @@ __init static int tracer_alloc_buffers(void)
register_snapshot_cmd();
+ test_can_verify();
+
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
@@ -9510,11 +10709,29 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+#ifdef CONFIG_TRACER_MAX_TRACE
+ struct trace_array *tr;
+
+ if (!snapshot_at_boot)
+ return;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->allocated_snapshot)
+ continue;
+
+ tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
+ }
+#endif
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
- kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
@@ -9522,14 +10739,19 @@ void __init early_trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+
+ init_events();
}
void __init trace_init(void)
{
trace_event_init();
+
+ if (boot_instance_index)
+ enable_instances();
}
-__init static int clear_boot_tracer(void)
+__init static void clear_boot_tracer(void)
{
/*
* The default tracer at boot buffer is an init section.
@@ -9539,26 +10761,21 @@ __init static int clear_boot_tracer(void)
* about to be freed.
*/
if (!default_bootup_tracer)
- return 0;
+ return;
printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
default_bootup_tracer);
default_bootup_tracer = NULL;
-
- return 0;
}
-fs_initcall(tracer_init_tracefs);
-late_initcall_sync(clear_boot_tracer);
-
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__init static int tracing_set_default_clock(void)
+__init static void tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Can not set tracing clock due to lockdown\n");
- return -EPERM;
+ return;
}
printk(KERN_WARNING
@@ -9568,8 +10785,21 @@ __init static int tracing_set_default_clock(void)
"on the kernel command line\n");
tracing_set_clock(&global_trace, "global");
}
+}
+#else
+static inline void tracing_set_default_clock(void) { }
+#endif
+
+__init static int late_trace_init(void)
+{
+ if (tracepoint_printk && tracepoint_printk_stop_on_boot) {
+ static_key_disable(&tracepoint_printk_key.key);
+ tracepoint_printk = 0;
+ }
+ tracing_set_default_clock();
+ clear_boot_tracer();
return 0;
}
-late_initcall_sync(tracing_set_default_clock);
-#endif
+
+late_initcall_sync(late_trace_init);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13db4000af3f..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -19,12 +19,19 @@
#include <linux/glob.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <linux/ctype.h>
+#include <linux/once_lite.h>
+
+#include "pid_list.h"
#ifdef CONFIG_FTRACE_SYSCALLS
-#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/unistd.h> /* For NR_syscalls */
#include <asm/syscall.h> /* some archs define it here */
#endif
+#define TRACE_MODE_WRITE 0640
+#define TRACE_MODE_READ 0440
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -43,7 +50,10 @@ enum trace_type {
TRACE_BLK,
TRACE_BPUTS,
TRACE_HWLAT,
+ TRACE_OSNOISE,
+ TRACE_TIMERLAT,
TRACE_RAW_DATA,
+ TRACE_FUNC_REPEATS,
__TRACE_LAST_TYPE,
};
@@ -67,12 +77,25 @@ enum trace_type {
#undef __array
#define __array(type, item, size) type item[size];
+/*
+ * For backward compatibility, older user space expects to see the
+ * kernel_stack event with a fixed size caller field. But today the fix
+ * size is ignored by the kernel, and the real structure is dynamic.
+ * Expose to user space: "unsigned long caller[8];" but the real structure
+ * will be "unsigned long caller[] __counted_by(size)"
+ */
+#undef __stack_array
+#define __stack_array(type, item, size, field) type item[] __counted_by(field);
+
#undef __array_desc
#define __array_desc(type, container, item, size)
#undef __dynamic_array
#define __dynamic_array(type, item) type item[];
+#undef __rel_dynamic_array
+#define __rel_dynamic_array(type, item) type item[];
+
#undef F_STRUCT
#define F_STRUCT(args...) args
@@ -97,16 +120,14 @@ enum trace_type {
#include "trace_entries.h"
/* Use this for memory failure errors */
-#define MEM_FAIL(condition, fmt, ...) ({ \
- static bool __section(.data.once) __warned; \
- int __ret_warn_once = !!(condition); \
- \
- if (unlikely(__ret_warn_once && !__warned)) { \
- __warned = true; \
- pr_err("ERROR: " fmt, ##__VA_ARGS__); \
- } \
- unlikely(__ret_warn_once); \
-})
+#define MEM_FAIL(condition, fmt, ...) \
+ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
+
+#define FAULT_STRING "(fault)"
+
+#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP 5
/*
* syscalls are special, and need special handling, this is why
@@ -129,29 +150,25 @@ struct kprobe_trace_entry_head {
unsigned long ip;
};
+struct eprobe_trace_entry_head {
+ struct trace_entry ent;
+};
+
struct kretprobe_trace_entry_head {
struct trace_entry ent;
unsigned long func;
unsigned long ret_ip;
};
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCHED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
- TRACE_FLAG_NMI = 0x40,
+struct fentry_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long ip;
+};
+
+struct fexit_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
};
#define TRACE_BUF_SIZE 1024
@@ -205,10 +222,14 @@ struct trace_options {
struct trace_option_dentry *topts;
};
-struct trace_pid_list {
- int pid_max;
- unsigned long *pids;
-};
+struct trace_pid_list *trace_pid_list_alloc(void);
+void trace_pid_list_free(struct trace_pid_list *pid_list);
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid);
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next);
enum {
TRACE_PIDS = BIT(0),
@@ -246,7 +267,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
* tracing_snapshot_cond(tr, cond_data), the cond_data passed in is
* passed in turn to the cond_snapshot.update() function. That data
* can be compared by the update() implementation with the cond_data
- * contained wihin the struct cond_snapshot instance associated with
+ * contained within the struct cond_snapshot instance associated with
* the trace_array. Because the tr->max_lock is held throughout the
* update() call, the update() function can directly retrieve the
* cond_snapshot and cond_data associated with the per-instance
@@ -271,7 +292,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
* take the snapshot, by returning 'true' if so, 'false' if no
* snapshot should be taken. Because the max_lock is held for
* the duration of update(), the implementation is safe to
- * directly retrieven and save any implementation data it needs
+ * directly retrieved and save any implementation data it needs
* to in association with the snapshot.
*/
struct cond_snapshot {
@@ -280,6 +301,17 @@ struct cond_snapshot {
};
/*
+ * struct trace_func_repeats - used to keep track of the consecutive
+ * (on the same CPU) calls of a single function.
+ */
+struct trace_func_repeats {
+ unsigned long ip;
+ unsigned long parent_ip;
+ unsigned long count;
+ u64 ts_last_call;
+};
+
+/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
@@ -303,7 +335,7 @@ struct trace_array {
struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -345,17 +377,21 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
- struct dentry *event_dir;
+ struct eventfs_inode *event_dir;
struct trace_options *topts;
struct list_head systems;
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
+ int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -369,11 +405,17 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
- int time_stamp_abs_ref;
+ int no_filter_buffering_ref;
struct list_head hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
struct cond_snapshot *cond_snapshot;
#endif
+ struct trace_func_repeats __percpu *last_func_repeats;
+ /*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+ bool ring_buffer_expanded;
};
enum {
@@ -389,7 +431,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
-extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
+extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
@@ -448,6 +491,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
+ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\
+ IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\
IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@@ -458,6 +503,8 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
+ IF_ASSIGN(var, ent, struct func_repeats_entry, \
+ TRACE_FUNC_REPEATS); \
__ftrace_bad_type(); \
} while (0)
@@ -547,7 +594,6 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
- int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -557,163 +603,6 @@ struct tracer {
bool noboot;
};
-
-/* Only current can touch trace_recursion */
-
-/*
- * For function tracing recursion:
- * The order of these bits are important.
- *
- * When function tracing occurs, the following steps are made:
- * If arch does not support a ftrace feature:
- * call internal function (uses INTERNAL bits) which calls...
- * If callback is registered to the "global" list, the list
- * function is called and recursion checks the GLOBAL bits.
- * then this function calls...
- * The function callback, which can use the FTRACE bits to
- * check for recursion.
- *
- * Now if the arch does not suppport a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are protecting against will
- * go through the same steps again.
- *
- * To prevent the multiple recursion checks, if a recursion
- * bit is set that is higher than the MAX bit of the current
- * check, then we know that the check was made by the previous
- * caller, and we can skip the current check.
- */
-enum {
- /* Function recursion bits */
- TRACE_FTRACE_BIT,
- TRACE_FTRACE_NMI_BIT,
- TRACE_FTRACE_IRQ_BIT,
- TRACE_FTRACE_SIRQ_BIT,
-
- /* INTERNAL_BITs must be greater than FTRACE_BITs */
- TRACE_INTERNAL_BIT,
- TRACE_INTERNAL_NMI_BIT,
- TRACE_INTERNAL_IRQ_BIT,
- TRACE_INTERNAL_SIRQ_BIT,
-
- TRACE_BRANCH_BIT,
-/*
- * Abuse of the trace_recursion.
- * As we need a way to maintain state if we are tracing the function
- * graph in irq because we want to trace a particular function that
- * was called in irq context but we have irq tracing off. Since this
- * can only be modified by current, we can reuse trace_recursion.
- */
- TRACE_IRQ_BIT,
-
- /* Set if the function is in the set_graph_function file */
- TRACE_GRAPH_BIT,
-
- /*
- * In the very unlikely case that an interrupt came in
- * at a start of graph tracing, and we want to trace
- * the function in that interrupt, the depth can be greater
- * than zero, because of the preempted start of a previous
- * trace. In an even more unlikely case, depth could be 2
- * if a softirq interrupted the start of graph tracing,
- * followed by an interrupt preempting a start of graph
- * tracing in the softirq, and depth can even be 3
- * if an NMI came in at the start of an interrupt function
- * that preempted a softirq start of a function that
- * preempted normal context!!!! Luckily, it can't be
- * greater than 3, so the next two bits are a mask
- * of what the depth is when we set TRACE_GRAPH_BIT
- */
-
- TRACE_GRAPH_DEPTH_START_BIT,
- TRACE_GRAPH_DEPTH_END_BIT,
-
- /*
- * To implement set_graph_notrace, if this bit is set, we ignore
- * function graph tracing of called functions, until the return
- * function is called to clear it.
- */
- TRACE_GRAPH_NOTRACE_BIT,
-};
-
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
-
-#define trace_recursion_depth() \
- (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
-#define trace_recursion_set_depth(depth) \
- do { \
- current->trace_recursion &= \
- ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \
- current->trace_recursion |= \
- ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \
- } while (0)
-
-#define TRACE_CONTEXT_BITS 4
-
-#define TRACE_FTRACE_START TRACE_FTRACE_BIT
-#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_LIST_START TRACE_INTERNAL_BIT
-#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
-
-static __always_inline int trace_get_context_bit(void)
-{
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = 0;
-
- else if (in_irq())
- bit = 1;
- else
- bit = 2;
- } else
- bit = 3;
-
- return bit;
-}
-
-static __always_inline int trace_test_and_set_recursion(int start, int max)
-{
- unsigned int val = current->trace_recursion;
- int bit;
-
- /* A previous recursion check was made */
- if ((val & TRACE_CONTEXT_MASK) > max)
- return 0;
-
- bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit)))
- return -1;
-
- val |= 1 << bit;
- current->trace_recursion = val;
- barrier();
-
- return bit;
-}
-
-static __always_inline void trace_clear_recursion(int bit)
-{
- unsigned int val = current->trace_recursion;
-
- if (!bit)
- return;
-
- bit = 1 << bit;
- val &= ~bit;
-
- barrier();
- current->trace_recursion = val;
-}
-
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
@@ -723,10 +612,14 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
-void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
+int tracing_open_file_tr(struct inode *inode, struct file *filp);
+int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
@@ -737,7 +630,7 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
-struct dentry *tracing_init_dentry(void);
+int tracing_init_dentry(void);
struct ring_buffer_event;
@@ -745,8 +638,7 @@ struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags,
- int pc);
+ unsigned int trace_ctx);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -757,6 +649,12 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
+bool trace_is_tracepoint_string(const char *str);
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap) __printf(2, 0);
+char *trace_iter_expand_format(struct trace_iterator *iter);
+
int trace_empty(struct trace_iterator *iter);
void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -771,15 +669,14 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -819,7 +716,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
int trace_pid_show(struct seq_file *m, void *v);
-void trace_free_pid_list(struct trace_pid_list *pid_list);
int trace_pid_write(struct trace_pid_list *filtered_pids,
struct trace_pid_list **new_pid_list,
const char __user *ubuf, size_t cnt);
@@ -829,29 +725,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-#endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#ifdef CONFIG_FSNOTIFY
+#define LATENCY_FS_NOTIFY
+#endif
+#endif /* CONFIG_TRACER_MAX_TRACE */
+#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
-
#else
-
static inline void latency_fsnotify(struct trace_array *tr) { }
-
#endif
#ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
#else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
- int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
}
#endif /* CONFIG_STACKTRACE */
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx);
+
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
@@ -871,10 +769,12 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-extern bool ring_buffer_expanded;
+extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
#ifdef CONFIG_FTRACE_STARTUP_TEST
+extern void __init disable_tracing_selftest(const char *reason);
+
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
@@ -898,6 +798,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
*/
#define __tracer_data __refdata
#else
+static inline void __init disable_tracing_selftest(const char *reason)
+{
+}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data __read_mostly
#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -962,6 +865,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_TAIL 0x100
#define TRACE_GRAPH_SLEEP_TIME 0x200
#define TRACE_GRAPH_GRAPH_TIME 0x400
+#define TRACE_GRAPH_PRINT_RETVAL 0x800
+#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -982,10 +887,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1026,7 +931,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
* is set, and called by an interrupt handler, we still
* want to trace it.
*/
- if (in_irq())
+ if (in_hardirq())
trace_recursion_set(TRACE_IRQ_BIT);
else
trace_recursion_clear(TRACE_IRQ_BIT);
@@ -1103,6 +1008,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_PID_IGNORE -1
+#define FTRACE_PID_TRACE -2
+
struct ftrace_func_command {
struct list_head list;
char *name;
@@ -1114,12 +1023,15 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
+ FTRACE_PID_IGNORE;
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent);
void ftrace_destroy_function_files(struct trace_array *tr);
+int ftrace_allocate_ftrace_ops(struct trace_array *tr);
+void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
@@ -1141,6 +1053,11 @@ ftrace_create_function_files(struct trace_array *tr,
{
return 0;
}
+static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { }
static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
@@ -1318,6 +1235,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(HEX, "hex"), \
C(BIN, "bin"), \
C(BLOCK, "block"), \
+ C(FIELDS, "fields"), \
C(PRINTK, "trace_printk"), \
C(ANNOTATE, "annotate"), \
C(USERSTACKTRACE, "userstacktrace"), \
@@ -1333,6 +1251,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
+ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1394,7 +1313,15 @@ static inline void trace_branch_disable(void)
#endif /* CONFIG_BRANCH_TRACER */
/* set ring buffers to default size if not already done so */
-int tracing_update_buffers(void);
+int tracing_update_buffers(struct trace_array *tr);
+
+union trace_synth_field {
+ u8 as_u8;
+ u16 as_u16;
+ u32 as_u32;
+ u64 as_u64;
+ struct trace_dynamic_info as_dynamic;
+};
struct ftrace_event_field {
struct list_head link;
@@ -1404,6 +1331,7 @@ struct ftrace_event_field {
int offset;
int size;
int is_signed;
+ int len;
};
struct prog_entry;
@@ -1424,7 +1352,7 @@ struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
- struct dentry *entry;
+ struct eventfs_inode *ei;
int ref_count;
int nr_events;
};
@@ -1436,15 +1364,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trcace_ctx,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1452,22 +1380,26 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);
+void early_enable_events(struct trace_array *tr, char *buf, bool disable_first);
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (this_cpu_read(trace_buffered_event) == event) {
- /* Simply release the temp buffer */
+ /* Simply release the temp buffer and enable preemption */
this_cpu_dec(trace_buffered_event_cnt);
+ preempt_enable_notrace();
return;
}
+ /* ring_buffer_discard_commit() enables preemption */
ring_buffer_discard_commit(buffer, event);
}
/*
* Helper function for event_trigger_unlock_commit{_regs}().
* If there are event triggers attached to this event that requires
- * filtering against its fields, then they wil be called as the
+ * filtering against its fields, then they will be called as the
* entry already holds the field information of the current event.
*
* It also checks if the event should be discarded or not.
@@ -1487,26 +1419,37 @@ __event_trigger_test_discard(struct trace_event_file *file,
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
- *tt = event_triggers_call(file, entry, event);
+ *tt = event_triggers_call(file, buffer, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
- (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, entry))) {
- __trace_event_discard_commit(buffer, event);
- return true;
- }
+ if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
+ EVENT_FILE_FL_FILTERED |
+ EVENT_FILE_FL_PID_FILTER))))
+ return false;
+
+ if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+ goto discard;
+
+ if (file->flags & EVENT_FILE_FL_FILTERED &&
+ !filter_match_preds(file->filter, entry))
+ goto discard;
+
+ if ((file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(file))
+ goto discard;
return false;
+ discard:
+ __trace_event_discard_commit(buffer, event);
+ return true;
}
/**
* event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1516,45 +1459,12 @@ static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc)
-{
- enum event_trigger_type tt = ETT_NONE;
-
- if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
-
- if (tt)
- event_triggers_post_call(file, tt);
-}
-
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc,
- struct pt_regs *regs)
+ void *entry, unsigned int trace_ctx)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- irq_flags, pc, regs);
+ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
if (tt)
event_triggers_post_call(file, tt);
@@ -1576,8 +1486,6 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
struct filter_pred;
struct regex;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
-
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
enum regex_type {
@@ -1596,20 +1504,10 @@ struct regex {
regex_match_func match;
};
-struct filter_pred {
- filter_pred_fn_t fn;
- u64 val;
- struct regex regex;
- unsigned short *ops;
- struct ftrace_event_field *field;
- int offset;
- int not;
- int op;
-};
-
static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING ||
field->filter_type == FILTER_STATIC_STRING ||
field->filter_type == FILTER_PTR_STRING ||
field->filter_type == FILTER_COMM;
@@ -1644,8 +1542,10 @@ extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
+extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
+extern void __trace_early_add_events(struct trace_array *tr);
extern struct trace_event_file *__find_event_file(struct trace_array *tr,
const char *system,
@@ -1678,9 +1578,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);
+enum {
+ EVENT_TRIGGER_FL_PROBE = BIT(0),
+};
+
struct event_trigger_data {
unsigned long count;
int ref;
+ int flags;
struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
@@ -1707,24 +1612,20 @@ struct enable_trigger_data {
};
extern int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
- struct event_trigger_data *data);
-extern void event_enable_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data);
-extern int event_enable_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+extern void event_enable_trigger_free(struct event_trigger_data *data);
+extern int event_enable_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter);
extern int event_enable_register_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
extern void event_enable_unregister_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *test,
struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
-extern int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable);
extern void update_cond_flag(struct trace_event_file *file);
@@ -1745,6 +1646,34 @@ get_named_trigger_data(struct event_trigger_data *data);
extern int register_event_command(struct event_command *cmd);
extern int unregister_event_command(struct event_command *cmd);
extern int register_trigger_hist_enable_disable_cmds(void);
+extern bool event_trigger_check_remove(const char *glob);
+extern bool event_trigger_empty_param(const char *param);
+extern int event_trigger_separate_filter(char *param_and_filter, char **param,
+ char **filter, bool param_required);
+extern struct event_trigger_data *
+event_trigger_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data);
+extern int event_trigger_parse_num(char *trigger,
+ struct event_trigger_data *trigger_data);
+extern int event_trigger_set_filter(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *param,
+ struct event_trigger_data *trigger_data);
+extern void event_trigger_reset_filter(struct event_command *cmd_ops,
+ struct event_trigger_data *trigger_data);
+extern int event_trigger_register(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data);
+extern void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data);
+
+extern void event_file_get(struct trace_event_file *file);
+extern void event_file_put(struct trace_event_file *file);
/**
* struct event_trigger_ops - callbacks for trace event triggers
@@ -1752,10 +1681,20 @@ extern int register_trigger_hist_enable_disable_cmds(void);
* The methods in this structure provide per-event trigger hooks for
* various trigger operations.
*
+ * The @init and @free methods are used during trigger setup and
+ * teardown, typically called from an event_command's @parse()
+ * function implementation.
+ *
+ * The @print method is used to print the trigger spec.
+ *
+ * The @trigger method is the function that actually implements the
+ * trigger and is called in the context of the triggering event
+ * whenever that event occurs.
+ *
* All the methods below, except for @init() and @free(), must be
* implemented.
*
- * @func: The trigger 'probe' function called when the triggering
+ * @trigger: The trigger 'probe' function called when the triggering
* event occurs. The data passed into this callback is the data
* that was supplied to the event_command @reg() function that
* registered the trigger (see struct event_command) along with
@@ -1784,15 +1723,13 @@ extern int register_trigger_hist_enable_disable_cmds(void);
* (see trace_event_triggers.c).
*/
struct event_trigger_ops {
- void (*func)(struct event_trigger_data *data,
- void *rec,
- struct ring_buffer_event *rbe);
- int (*init)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
- void (*free)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+ void (*trigger)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
+ int (*init)(struct event_trigger_data *data);
+ void (*free)(struct event_trigger_data *data);
int (*print)(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data);
};
@@ -1835,7 +1772,7 @@ struct event_trigger_ops {
* All the methods below, except for @set_filter() and @unreg_all(),
* must be implemented.
*
- * @func: The callback function responsible for parsing and
+ * @parse: The callback function responsible for parsing and
* registering the trigger written to the 'trigger' file by the
* user. It allocates the trigger instance and registers it with
* the appropriate trace event. It makes use of the other
@@ -1870,21 +1807,24 @@ struct event_trigger_ops {
*
* @get_trigger_ops: The callback function invoked to retrieve the
* event_trigger_ops implementation associated with the command.
+ * This callback function allows a single event_command to
+ * support multiple trigger implementations via different sets of
+ * event_trigger_ops, depending on the value of the @param
+ * string.
*/
struct event_command {
struct list_head list;
char *name;
enum event_trigger_type trigger_type;
int flags;
- int (*func)(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *params);
+ int (*parse)(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter);
int (*reg)(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
void (*unreg)(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
void (*unreg_all)(struct trace_event_file *file);
@@ -1968,15 +1908,14 @@ extern int tracing_set_cpumask(struct trace_array *tr,
#define MAX_EVENT_NAME_LEN 64
-extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
extern ssize_t trace_parse_run_command(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
+ int (*createfn)(const char *));
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
@@ -2057,8 +1996,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif
-extern struct trace_iterator *tracepoint_print_iter;
-
/*
* Reset the state of the trace_iterator so that it can read consumed data.
* Normally, the trace_iterator is used for reading the data when it is not
@@ -2066,15 +2003,68 @@ extern struct trace_iterator *tracepoint_print_iter;
*/
static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
{
- const size_t offset = offsetof(struct trace_iterator, seq);
+ memset_startat(iter, 0, seq);
+ iter->pos = -1;
+}
- /*
- * Keep gcc from complaining about overwriting more than just one
- * member in the structure.
- */
- memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
+/* Check the name is good for event/group/fields */
+static inline bool __is_good_name(const char *name, bool hash_ok)
+{
+ if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-'))
+ return false;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_' &&
+ (!hash_ok || *name != '-'))
+ return false;
+ }
+ return true;
+}
- iter->pos = -1;
+/* Check the name is good for event/group/fields */
+static inline bool is_good_name(const char *name)
+{
+ return __is_good_name(name, false);
+}
+
+/* Check the name is good for system */
+static inline bool is_good_system_name(const char *name)
+{
+ return __is_good_name(name, true);
}
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
+/*
+ * This is a generic way to read and write a u64 value from a file in tracefs.
+ *
+ * The value is stored on the variable pointed by *val. The value needs
+ * to be at least *min and at most *max. The write is protected by an
+ * existing *lock.
+ */
+struct trace_min_max_param {
+ struct mutex *lock;
+ u64 *val;
+ u64 *min;
+ u64 *max;
+};
+
+#define U64_STR_SIZE 24 /* 20 digits max */
+
+extern const struct file_operations trace_min_max_fops;
+
+#ifdef CONFIG_RV
+extern int rv_init_interface(void);
+#else
+static inline int rv_init_interface(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 2e9a4746ea85..54d5fa35c90a 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -31,7 +31,7 @@ static bool ok_to_run;
* it simply writes "START". As the first write is cold cache and
* the rest is hot, we save off that time in bm_first and it is
* reported as "first", which is shown in the second write to the
- * tracepoint. The "first" field is writen within the statics from
+ * tracepoint. The "first" field is written within the statics from
* then on but never changes.
*/
static void trace_do_benchmark(void)
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
local_irq_disable();
start = trace_clock_local();
- trace_benchmark_event(bm_str);
+ trace_benchmark_event(bm_str, bm_last);
stop = trace_clock_local();
local_irq_enable();
@@ -112,7 +112,7 @@ static void trace_do_benchmark(void)
int i = 0;
/*
* stddev is the square of standard deviation but
- * we want the actualy number. Use the average
+ * we want the actually number. Use the average
* as our seed to find the std.
*
* The next try is:
@@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg)
/*
* We don't go to sleep, but let others run as well.
- * This is bascially a "yield()" to let any task that
+ * This is basically a "yield()" to let any task that
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 79e6fbe5b365..c3e91060dc94 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
TRACE_EVENT_FN(benchmark_event,
- TP_PROTO(const char *str),
+ TP_PROTO(const char *str, u64 delta),
- TP_ARGS(str),
+ TP_ARGS(str, delta),
TP_STRUCT__entry(
__array( char, str, BENCHMARK_EVENT_STRLEN )
+ __field( u64, delta)
),
TP_fast_assign(
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ __entry->delta = delta;
),
- TP_printk("%s", __entry->str),
+ TP_printk("%s delta=%llu", __entry->str, __entry->delta),
trace_benchmark_reg, trace_benchmark_unreg
);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index fa0fc08c6ef8..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
/* Common ftrace options */
xbc_node_for_each_array_value(node, "options", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
pr_err("String is too long: %s\n", p);
continue;
}
@@ -40,6 +40,16 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
pr_err("Failed to set option: %s\n", buf);
}
+ p = xbc_node_find_value(node, "tracing_on", NULL);
+ if (p && *p != '\0') {
+ if (kstrtoul(p, 10, &v))
+ pr_err("Failed to set tracing on: %s\n", p);
+ if (v)
+ tracer_tracing_on(tr);
+ else
+ tracer_tracing_off(tr);
+ }
+
p = xbc_node_find_value(node, "trace_clock", NULL);
if (p && *p != '\0') {
if (tracing_set_clock(tr, p) < 0)
@@ -77,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node)
const char *p;
xbc_node_for_each_array_value(node, "events", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) {
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
pr_err("String is too long: %s\n", p);
continue;
}
@@ -161,6 +171,293 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event)
}
#endif
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init __printf(3, 4)
+append_printf(char **bufp, char *end, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ if (*bufp == end)
+ return -ENOSPC;
+
+ va_start(args, fmt);
+ ret = vsnprintf(*bufp, end - *bufp, fmt, args);
+ if (ret < end - *bufp) {
+ *bufp += ret;
+ } else {
+ *bufp = end;
+ ret = -ERANGE;
+ }
+ va_end(args);
+
+ return ret;
+}
+
+static int __init
+append_str_nospace(char **bufp, char *end, const char *str)
+{
+ char *p = *bufp;
+ int len;
+
+ while (p < end - 1 && *str != '\0') {
+ if (!isspace(*str))
+ *(p++) = *str;
+ str++;
+ }
+ *p = '\0';
+ if (p == end - 1) {
+ *bufp = end;
+ return -ENOSPC;
+ }
+ len = p - *bufp;
+ *bufp = p;
+ return (int)len;
+}
+
+static int __init
+trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
+ char *end, const char *key)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char sep;
+
+ p = xbc_node_find_value(hnode, key, &anode);
+ if (p) {
+ if (!anode) {
+ pr_err("hist.%s requires value(s).\n", key);
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ":%s", key);
+ sep = '=';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '=')
+ sep = ',';
+ }
+ } else
+ return -ENOENT;
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
+ char *end, const char *handler,
+ const char *param)
+{
+ struct xbc_node *knode, *anode;
+ const char *p;
+ char sep;
+
+ /* Compose 'handler' parameter */
+ p = xbc_node_find_value(hnode, param, NULL);
+ if (!p) {
+ pr_err("hist.%s requires '%s' option.\n",
+ xbc_node_get_data(hnode), param);
+ return -EINVAL;
+ }
+ append_printf(bufp, end, ":%s(%s)", handler, p);
+
+ /* Compose 'action' parameter */
+ knode = xbc_node_find_subkey(hnode, "trace");
+ if (!knode)
+ knode = xbc_node_find_subkey(hnode, "save");
+
+ if (knode) {
+ anode = xbc_node_get_child(knode);
+ if (!anode || !xbc_node_is_value(anode)) {
+ pr_err("hist.%s.%s requires value(s).\n",
+ xbc_node_get_data(hnode),
+ xbc_node_get_data(knode));
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ".%s", xbc_node_get_data(knode));
+ sep = '(';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '(')
+ sep = ',';
+ }
+ append_printf(bufp, end, ")");
+ } else if (xbc_node_find_subkey(hnode, "snapshot")) {
+ append_printf(bufp, end, ".snapshot()");
+ } else {
+ pr_err("hist.%s requires an action.\n",
+ xbc_node_get_data(hnode));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
+ char *end, const char *param)
+{
+ struct xbc_node *node;
+ const char *p, *handler;
+ int ret = 0;
+
+ handler = xbc_node_get_data(hnode);
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param);
+ if (ret < 0)
+ break;
+ }
+
+ if (xbc_node_find_subkey(hnode, param))
+ ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
+
+ return ret;
+}
+
+/*
+ * Histogram boottime tracing syntax.
+ *
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] {
+ * keys = <KEY>[,...]
+ * values = <VAL>[,...]
+ * sort = <SORT-KEY>[,...]
+ * size = <ENTRIES>
+ * name = <HISTNAME>
+ * var { <VAR> = <EXPR> ... }
+ * pause|continue|clear
+ * onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
+ * onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
+ * filter = <FILTER>
+ * }
+ *
+ * Where <ACTION> are;
+ *
+ * trace = <EVENT>, <ARG1>[, ...]
+ * save = <ARG1>[, ...]
+ * snapshot
+ */
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node, *knode;
+ char *end = buf + size;
+ const char *p;
+ int ret = 0;
+
+ append_printf(&buf, end, "hist");
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "keys");
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ pr_err("hist requires keys.\n");
+ return -EINVAL;
+ }
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "values");
+ if (ret == -EINVAL)
+ return ret;
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "sort");
+ if (ret == -EINVAL)
+ return ret;
+
+ p = xbc_node_find_value(hnode, "size", NULL);
+ if (p)
+ append_printf(&buf, end, ":size=%s", p);
+
+ p = xbc_node_find_value(hnode, "name", NULL);
+ if (p)
+ append_printf(&buf, end, ":name=%s", p);
+
+ node = xbc_node_find_subkey(hnode, "var");
+ if (node) {
+ xbc_node_for_each_key_value(node, knode, p) {
+ /* Expression must not include spaces. */
+ append_printf(&buf, end, ":%s=",
+ xbc_node_get_data(knode));
+ append_str_nospace(&buf, end, p);
+ }
+ }
+
+ /* Histogram control attributes (mutual exclusive) */
+ if (xbc_node_find_value(hnode, "pause", NULL))
+ append_printf(&buf, end, ":pause");
+ else if (xbc_node_find_value(hnode, "continue", NULL))
+ append_printf(&buf, end, ":continue");
+ else if (xbc_node_find_value(hnode, "clear", NULL))
+ append_printf(&buf, end, ":clear");
+
+ /* Histogram handler and actions */
+ node = xbc_node_find_subkey(hnode, "onmax");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onchange");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onmatch");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
+ return -EINVAL;
+
+ p = xbc_node_find_value(hnode, "filter", NULL);
+ if (p)
+ append_printf(&buf, end, " if %s", p);
+
+ if (buf == end) {
+ pr_err("hist exceeds the max command length.\n");
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node;
+ const char *p;
+ char *tmp;
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+
+ if (xbc_node_find_subkey(hnode, "keys")) {
+ if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+}
+#else
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ /* do nothing */
+}
+#endif
+
static void __init
trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
struct xbc_node *enode)
@@ -189,18 +486,24 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
p = xbc_node_find_value(enode, "filter", NULL);
if (p && *p != '\0') {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
pr_err("filter string is too long: %s\n", p);
else if (apply_event_filter(file, buf) < 0)
pr_err("Failed to apply filter: %s\n", buf);
}
- xbc_node_for_each_array_value(enode, "actions", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
- pr_err("action string is too long: %s\n", p);
- else if (trigger_process_regex(file, buf) < 0)
- pr_err("Failed to apply an action: %s\n", buf);
- }
+ if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", p);
+ }
+ anode = xbc_node_find_subkey(enode, "hist");
+ if (anode)
+ trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
+ } else if (xbc_node_find_value(enode, "actions", NULL))
+ pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
if (xbc_node_find_value(enode, "enable", NULL)) {
if (trace_event_enable_disable(file, 1, 0) < 0)
@@ -215,14 +518,37 @@ static void __init
trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
{
struct xbc_node *gnode, *enode;
+ bool enable, enable_all = false;
+ const char *data;
- node = xbc_node_find_child(node, "event");
+ node = xbc_node_find_subkey(node, "event");
if (!node)
return;
/* per-event key starts with "event.GROUP.EVENT" */
- xbc_node_for_each_child(node, gnode)
- xbc_node_for_each_child(gnode, enode)
+ xbc_node_for_each_subkey(node, gnode) {
+ data = xbc_node_get_data(gnode);
+ if (!strcmp(data, "enable")) {
+ enable_all = true;
+ continue;
+ }
+ enable = false;
+ xbc_node_for_each_subkey(gnode, enode) {
+ data = xbc_node_get_data(enode);
+ if (!strcmp(data, "enable")) {
+ enable = true;
+ continue;
+ }
trace_boot_init_one_event(tr, gnode, enode);
+ }
+ /* Event enablement must be done after event settings */
+ if (enable) {
+ data = xbc_node_get_data(gnode);
+ trace_array_set_clr_event(tr, data, NULL, true);
+ }
+ }
+ /* Ditto */
+ if (enable_all)
+ trace_array_set_clr_event(tr, NULL, NULL, true);
}
#else
#define trace_boot_enable_events(tr, node) do {} while (0)
@@ -274,6 +600,12 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
if (tracing_set_tracer(tr, p) < 0)
pr_err("Failed to set given tracer: %s\n", p);
}
+
+ /* Since tracer can free snapshot buffer, allocate snapshot here.*/
+ if (xbc_node_find_value(node, "alloc_snapshot", NULL)) {
+ if (tracing_alloc_snapshot_instance(tr) < 0)
+ pr_err("Failed to allocate snapshot buffer\n");
+ }
}
static void __init
@@ -292,16 +624,16 @@ trace_boot_init_instances(struct xbc_node *node)
struct trace_array *tr;
const char *p;
- node = xbc_node_find_child(node, "instance");
+ node = xbc_node_find_subkey(node, "instance");
if (!node)
return;
- xbc_node_for_each_child(node, inode) {
+ xbc_node_for_each_subkey(node, inode) {
p = xbc_node_get_data(inode);
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
@@ -328,7 +660,12 @@ static int __init trace_boot_init(void)
trace_boot_init_one_instance(tr, trace_node);
trace_boot_init_instances(trace_node);
+ disable_tracing_selftest("running boot-time tracing");
+
return 0;
}
-
-fs_initcall(trace_boot_init);
+/*
+ * Start tracing at the end of core-initcall, so that it starts tracing
+ * from the beginning of postcore_initcall.
+ */
+core_initcall_sync(trace_boot_init);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
const char *p;
if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
if (atomic_read(&data->disabled))
goto out;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(flags);
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out;
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
new file mode 100644
index 000000000000..5bbdbcbbde3c
--- /dev/null
+++ b/kernel/trace/trace_btf.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "trace_btf.h"
+
+/*
+ * Find a function proto type by name, and return the btf_type with its btf
+ * in *@btf_p. Return NULL if not found.
+ * Note that caller has to call btf_put(*@btf_p) after using the btf_type.
+ */
+const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p)
+{
+ const struct btf_type *t;
+ s32 id;
+
+ id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p);
+ if (id < 0)
+ return NULL;
+
+ /* Get BTF_KIND_FUNC type */
+ t = btf_type_by_id(*btf_p, id);
+ if (!t || !btf_type_is_func(t))
+ goto err;
+
+ /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */
+ t = btf_type_by_id(*btf_p, t->type);
+ if (!t || !btf_type_is_func_proto(t))
+ goto err;
+
+ return t;
+err:
+ btf_put(*btf_p);
+ return NULL;
+}
+
+/*
+ * Get function parameter with the number of parameters.
+ * This can return NULL if the function has no parameters.
+ * It can return -EINVAL if the @func_proto is not a function proto type.
+ */
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr)
+{
+ if (!btf_type_is_func_proto(func_proto))
+ return ERR_PTR(-EINVAL);
+
+ *nr = btf_type_vlen(func_proto);
+ if (*nr > 0)
+ return (const struct btf_param *)(func_proto + 1);
+ else
+ return NULL;
+}
+
+#define BTF_ANON_STACK_MAX 16
+
+struct btf_anon_stack {
+ u32 tid;
+ u32 offset;
+};
+
+/*
+ * Find a member of data structure/union by name and return it.
+ * Return NULL if not found, or -EINVAL if parameter is invalid.
+ * If the member is an member of anonymous union/structure, the offset
+ * of that anonymous union/structure is stored into @anon_offset. Caller
+ * can calculate the correct offset from the root data structure by
+ * adding anon_offset to the member's offset.
+ */
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+ const struct btf_type *type,
+ const char *member_name,
+ u32 *anon_offset)
+{
+ struct btf_anon_stack *anon_stack;
+ const struct btf_member *member;
+ u32 tid, cur_offset = 0;
+ const char *name;
+ int i, top = 0;
+
+ anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL);
+ if (!anon_stack)
+ return ERR_PTR(-ENOMEM);
+
+retry:
+ if (!btf_type_is_struct(type)) {
+ member = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ for_each_member(i, type, member) {
+ if (!member->name_off) {
+ /* Anonymous union/struct: push it for later use */
+ if (btf_type_skip_modifiers(btf, member->type, &tid) &&
+ top < BTF_ANON_STACK_MAX) {
+ anon_stack[top].tid = tid;
+ anon_stack[top++].offset =
+ cur_offset + member->offset;
+ }
+ } else {
+ name = btf_name_by_offset(btf, member->name_off);
+ if (name && !strcmp(member_name, name)) {
+ if (anon_offset)
+ *anon_offset = cur_offset;
+ goto out;
+ }
+ }
+ }
+ if (top > 0) {
+ /* Pop from the anonymous stack and retry */
+ tid = anon_stack[--top].tid;
+ cur_offset = anon_stack[top].offset;
+ type = btf_type_by_id(btf, tid);
+ goto retry;
+ }
+ member = NULL;
+
+out:
+ kfree(anon_stack);
+ return member;
+}
+
diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h
new file mode 100644
index 000000000000..4bc44bc261e6
--- /dev/null
+++ b/kernel/trace/trace_btf.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/btf.h>
+
+const struct btf_type *btf_find_func_proto(const char *func_name,
+ struct btf **btf_p);
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
+ s32 *nr);
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+ const struct btf_type *type,
+ const char *member_name,
+ u32 *anon_offset);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aaf6793ededa..4702efb00ff2 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
- u64 now;
+ u64 now, prev_time;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = sched_clock_cpu(this_cpu);
+
/*
- * If in an NMI context then dont risk lockups and return the
- * cpu_clock() time:
+ * The global clock "guarantees" that the events are ordered
+ * between CPUs. But if two events on two different CPUS call
+ * trace_clock_global at roughly the same time, it really does
+ * not matter which one gets the earlier time. Just make sure
+ * that the same CPU will always show a monotonic clock.
+ *
+ * Use a read memory barrier to get the latest written
+ * time that was recorded.
*/
- if (unlikely(in_nmi()))
- goto out;
+ smp_rmb();
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ now = sched_clock_cpu(this_cpu);
- arch_spin_lock(&trace_clock_struct.lock);
+ /* Make sure that now is always greater than or equal to prev_time */
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
/*
- * TODO: if this happens often then maybe we should reset
- * my_scd->clock to prev_time+1, to make sure
- * we start ticking with the local clock from now on?
+ * If in an NMI context then dont risk lockups and simply return
+ * the current time.
*/
- if ((s64)(now - trace_clock_struct.prev_time) < 0)
- now = trace_clock_struct.prev_time + 1;
+ if (unlikely(in_nmi()))
+ goto out;
- trace_clock_struct.prev_time = now;
+ /* Tracing can cause strange recursion, always use a try lock */
+ if (arch_spin_trylock(&trace_clock_struct.lock)) {
+ /* Reread prev_time in case it was already updated */
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
- arch_spin_unlock(&trace_clock_struct.lock);
+ trace_clock_struct.prev_time = now;
+ /* The unlock acts as the wmb for the above rmb */
+ arch_spin_unlock(&trace_clock_struct.lock);
+ }
out:
raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 9f2e8520b748..4376887e0d8a 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -13,11 +13,49 @@
#include <linux/tracefs.h>
#include "trace.h"
+#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
static DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
+bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
+{
+ struct trace_event_call *call;
+ bool ret = false;
+
+ if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call == dyn_call) {
+ atomic_inc(&dyn_call->refcnt);
+ ret = true;
+ }
+ }
+ up_read(&trace_event_sem);
+ return ret;
+}
+
+void trace_event_dyn_put_ref(struct trace_event_call *call)
+{
+ if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return;
+
+ if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) {
+ atomic_set(&call->refcnt, 0);
+ return;
+ }
+
+ atomic_dec(&call->refcnt);
+}
+
+bool trace_event_dyn_busy(struct trace_event_call *call)
+{
+ return atomic_read(&call->refcnt) != 0;
+}
+
int dyn_event_register(struct dyn_event_operations *ops)
{
if (!ops || !ops->create || !ops->show || !ops->is_busy ||
@@ -31,23 +69,31 @@ int dyn_event_register(struct dyn_event_operations *ops)
return 0;
}
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type)
{
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
- int ret = -ENOENT;
+ int argc, ret = -ENOENT;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':')
- return -EINVAL;
+ if (argv[0][1] != ':') {
+ ret = -EINVAL;
+ goto out;
+ }
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event)
- return -EINVAL;
+ if (!event) {
+ ret = -EINVAL;
+ goto out;
+ }
event++;
}
- argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -55,37 +101,41 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0')
- return -EINVAL;
+ if (!system && event[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
if (type && type != pos->ops)
continue;
if (!pos->ops->match(system, event,
- argc, (const char **)argv, pos))
+ argc - 1, (const char **)argv + 1, pos))
continue;
ret = pos->ops->free(pos);
if (ret)
break;
}
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
-
+out:
+ argv_free(argv);
return ret;
}
-static int create_dyn_event(int argc, char **argv)
+static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
int ret = -ENODEV;
- if (argv[0][0] == '-' || argv[0][0] == '!')
- return dyn_event_release(argc, argv, NULL);
+ if (raw_command[0] == '-' || raw_command[0] == '!')
+ return dyn_event_release(raw_command, NULL);
mutex_lock(&dyn_event_ops_mutex);
list_for_each_entry(ops, &dyn_event_ops_list, list) {
- ret = ops->create(argc, (const char **)argv);
+ ret = ops->create(raw_command);
if (!ret || ret != -ECANCELED)
break;
}
@@ -165,6 +215,7 @@ int dyn_events_release_all(struct dyn_event_operations *type)
break;
}
out:
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
return ret;
@@ -206,19 +257,14 @@ static const struct file_operations dynamic_events_ops = {
/* Make a tracefs interface for controlling dynamic events */
static __init int init_dynamic_event(void)
{
- struct dentry *d_tracer;
- struct dentry *entry;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- entry = tracefs_create_file("dynamic_events", 0644, d_tracer,
- NULL, &dynamic_events_ops);
-
- /* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+ trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
+ NULL, &dynamic_events_ops);
return 0;
}
@@ -276,7 +322,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd,
* arguments of the form 'type variable_name;' or 'x+y'.
*
* The lhs argument string will be appended to the current cmd string,
- * followed by an operator, if applicable, followd by the rhs string,
+ * followed by an operator, if applicable, followed by the rhs string,
* followed finally by a separator, if applicable. Before the
* argument is added, the @check_arg function, if present, will be
* used to check the sanity of the current arg strings.
@@ -402,7 +448,7 @@ void dynevent_arg_init(struct dynevent_arg *arg,
* whitespace, all followed by a separator, if applicable. After the
* first arg string is successfully appended to the command string,
* the optional @operator is appended, followed by the second arg and
- * and optional @separator. If no separator was specified when
+ * optional @separator. If no separator was specified when
* initializing the arg, a space will be appended.
*/
void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6857a254ede..936477a111d3 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -29,17 +29,17 @@ struct dyn_event;
* @show: Showing method. This is invoked when user reads the event definitions
* via dynamic_events interface.
* @is_busy: Check whether given event is busy so that it can not be deleted.
- * Return true if it is busy, otherwides false.
- * @free: Delete the given event. Return 0 if success, otherwides error.
+ * Return true if it is busy, otherwise false.
+ * @free: Delete the given event. Return 0 if success, otherwise error.
* @match: Check whether given event and system name match this event. The argc
- * and argv is used for exact match. Return true if it matches, otherwides
+ * and argv is used for exact match. Return true if it matches, otherwise
* false.
*
* Except for @create, these methods are called under holding event_mutex.
*/
struct dyn_event_operations {
struct list_head list;
- int (*create)(int argc, const char *argv[]);
+ int (*create)(const char *raw_command);
int (*show)(struct seq_file *m, struct dyn_event *ev);
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
@@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
return 0;
}
-static inline int dyn_event_add(struct dyn_event *ev)
+static inline int dyn_event_add(struct dyn_event *ev,
+ struct trace_event_call *call)
{
lockdep_assert_held(&event_mutex);
if (!ev || !ev->ops)
return -EINVAL;
+ call->flags |= TRACE_EVENT_FL_DYNAMIC;
list_add_tail(&ev->list, &dyn_event_list);
return 0;
}
@@ -97,7 +99,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 18c4a58aff79..c47422b20908 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -32,7 +32,7 @@
* to be deciphered for the format file. Although these macros
* may become out of sync with the internal structure, they
* will create a compile error if it happens. Since the
- * internel structures are just tracing helpers, this is not
+ * internal structures are just tracing helpers, this is not
* an issue.
*
* When an internal structure is used, it should use:
@@ -86,6 +86,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -93,10 +95,32 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
- __field_packed( unsigned long, ret, overrun )
+ __field_packed( unsigned long, ret, retval )
+ __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, overrun )
__field_packed( unsigned long long, ret, calltime)
__field_packed( unsigned long long, ret, rettime )
+ ),
+
+ F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
+ (void *)__entry->func, __entry->depth,
+ __entry->calltime, __entry->rettime,
+ __entry->depth, __entry->retval)
+);
+
+#else
+
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
+
+ TRACE_GRAPH_RET,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ret, ret )
+ __field_packed( unsigned long, ret, func )
__field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, overrun )
+ __field_packed( unsigned long long, ret, calltime)
+ __field_packed( unsigned long long, ret, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
@@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__entry->depth)
);
+#endif
+
/*
* Context switch trace entry - which task (and prio) we switched from/to:
*
@@ -164,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size)
),
F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
@@ -338,3 +364,66 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->nmi_total_ts,
__entry->nmi_count)
);
+
+#define FUNC_REPEATS_GET_DELTA_TS(entry) \
+ (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \
+
+FTRACE_ENTRY(func_repeats, func_repeats_entry,
+
+ TRACE_FUNC_REPEATS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned long, parent_ip )
+ __field( u16 , count )
+ __field( u16 , top_delta_ts )
+ __field( u32 , bottom_delta_ts )
+ ),
+
+ F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)",
+ (void *)__entry->ip,
+ (void *)__entry->parent_ip,
+ __entry->count,
+ FUNC_REPEATS_GET_DELTA_TS(__entry))
+);
+
+FTRACE_ENTRY(osnoise, osnoise_entry,
+
+ TRACE_OSNOISE,
+
+ F_STRUCT(
+ __field( u64, noise )
+ __field( u64, runtime )
+ __field( u64, max_sample )
+ __field( unsigned int, hw_count )
+ __field( unsigned int, nmi_count )
+ __field( unsigned int, irq_count )
+ __field( unsigned int, softirq_count )
+ __field( unsigned int, thread_count )
+ ),
+
+ F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n",
+ __entry->noise,
+ __entry->max_sample,
+ __entry->hw_count,
+ __entry->nmi_count,
+ __entry->irq_count,
+ __entry->softirq_count,
+ __entry->thread_count)
+);
+
+FTRACE_ENTRY(timerlat, timerlat_entry,
+
+ TRACE_TIMERLAT,
+
+ F_STRUCT(
+ __field( unsigned int, seqnum )
+ __field( int, context )
+ __field( u64, timer_latency )
+ ),
+
+ F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n",
+ __entry->seqnum,
+ __entry->context,
+ __entry->timer_latency)
+);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
new file mode 100644
index 000000000000..03c851f57969
--- /dev/null
+++ b/kernel/trace/trace_eprobe.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event probes
+ *
+ * Part of this code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <mhiramat@kernel.org>
+ *
+ * Copyright (C) 2021, VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
+
+#define EPROBE_EVENT_SYSTEM "eprobes"
+
+struct trace_eprobe {
+ /* tracepoint system */
+ const char *event_system;
+
+ /* tracepoint event */
+ const char *event_name;
+
+ /* filter string for the tracepoint */
+ char *filter_str;
+
+ struct trace_event_call *event;
+
+ struct dyn_event devent;
+ struct trace_probe tp;
+};
+
+struct eprobe_data {
+ struct trace_event_file *file;
+ struct trace_eprobe *ep;
+};
+
+
+#define for_each_trace_eprobe_tp(ep, _tp) \
+ list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list)
+
+static int __trace_eprobe_create(int argc, const char *argv[]);
+
+static void trace_event_probe_cleanup(struct trace_eprobe *ep)
+{
+ if (!ep)
+ return;
+ trace_probe_cleanup(&ep->tp);
+ kfree(ep->event_name);
+ kfree(ep->event_system);
+ if (ep->event)
+ trace_event_put_ref(ep->event);
+ kfree(ep->filter_str);
+ kfree(ep);
+}
+
+static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_eprobe, devent);
+}
+
+static int eprobe_dyn_event_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_eprobe_create);
+}
+
+static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int i;
+
+ seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp),
+ trace_probe_name(&ep->tp));
+ seq_printf(m, " %s.%s", ep->event_system, ep->event_name);
+
+ for (i = 0; i < ep->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int unregister_trace_eprobe(struct trace_eprobe *ep)
+{
+ /* If other probes are on the event, just unregister eprobe */
+ if (trace_probe_has_sibling(&ep->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&ep->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (trace_probe_unregister_event_call(&ep->tp))
+ return -EBUSY;
+
+unreg:
+ dyn_event_remove(&ep->devent);
+ trace_probe_unlink(&ep->tp);
+
+ return 0;
+}
+
+static int eprobe_dyn_event_release(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int ret = unregister_trace_eprobe(ep);
+
+ if (!ret)
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+static bool eprobe_dyn_event_is_busy(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+
+ return trace_probe_is_enabled(&ep->tp);
+}
+
+static bool eprobe_dyn_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ const char *slash;
+
+ /*
+ * We match the following:
+ * event only - match all eprobes with event name
+ * system and event only - match all system/event probes
+ * system only - match all system probes
+ *
+ * The below has the above satisfied with more arguments:
+ *
+ * attached system/event - If the arg has the system and event
+ * the probe is attached to, match
+ * probes with the attachment.
+ *
+ * If any more args are given, then it requires a full match.
+ */
+
+ /*
+ * If system exists, but this probe is not part of that system
+ * do not match.
+ */
+ if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+ return false;
+
+ /* Must match the event name */
+ if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0)
+ return false;
+
+ /* No arguments match all */
+ if (argc < 1)
+ return true;
+
+ /* First argument is the system/event the probe is attached to */
+
+ slash = strchr(argv[0], '/');
+ if (!slash)
+ slash = strchr(argv[0], '.');
+ if (!slash)
+ return false;
+
+ if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+ return false;
+ if (strcmp(ep->event_name, slash + 1))
+ return false;
+
+ argc--;
+ argv++;
+
+ /* If there are no other args, then match */
+ if (argc < 1)
+ return true;
+
+ return trace_probe_match_command_args(&ep->tp, argc, argv);
+}
+
+static struct dyn_event_operations eprobe_dyn_event_ops = {
+ .create = eprobe_dyn_event_create,
+ .show = eprobe_dyn_event_show,
+ .is_busy = eprobe_dyn_event_is_busy,
+ .free = eprobe_dyn_event_release,
+ .match = eprobe_dyn_event_match,
+};
+
+static struct trace_eprobe *alloc_event_probe(const char *group,
+ const char *this_event,
+ struct trace_event_call *event,
+ int nargs)
+{
+ struct trace_eprobe *ep;
+ const char *event_name;
+ const char *sys_name;
+ int ret = -ENOMEM;
+
+ if (!event)
+ return ERR_PTR(-ENODEV);
+
+ sys_name = event->class->system;
+ event_name = trace_event_name(event);
+
+ ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
+ if (!ep) {
+ trace_event_put_ref(event);
+ goto error;
+ }
+ ep->event = event;
+ ep->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ep->event_name)
+ goto error;
+ ep->event_system = kstrdup(sys_name, GFP_KERNEL);
+ if (!ep->event_system)
+ goto error;
+
+ ret = trace_probe_init(&ep->tp, this_event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
+ return ep;
+error:
+ trace_event_probe_cleanup(ep);
+ return ERR_PTR(ret);
+}
+
+static int eprobe_event_define_fields(struct trace_event_call *event_call)
+{
+ struct eprobe_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_fields eprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = eprobe_event_define_fields },
+ {}
+};
+
+/* Event entry printers */
+static enum print_line_t
+print_eprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct eprobe_trace_entry_head *field;
+ struct trace_event_call *pevent;
+ struct trace_event *probed_event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_eprobe *ep;
+ struct trace_probe *tp;
+ unsigned int type;
+
+ field = (struct eprobe_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ ep = container_of(tp, struct trace_eprobe, tp);
+ type = ep->event->event.type;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ probed_event = ftrace_find_event(type);
+ if (probed_event) {
+ pevent = container_of(probed_event, struct trace_event_call, event);
+ trace_seq_printf(s, "%s.%s", pevent->class->system,
+ trace_event_name(pevent));
+ } else {
+ trace_seq_printf(s, "%u", type);
+ }
+
+ trace_seq_putc(s, ')');
+
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static nokprobe_inline unsigned long
+get_event_field(struct fetch_insn *code, void *rec)
+{
+ struct ftrace_event_field *field = code->data;
+ unsigned long val;
+ void *addr;
+
+ addr = rec + field->offset;
+
+ if (is_string_field(field)) {
+ switch (field->filter_type) {
+ case FILTER_DYN_STRING:
+ val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_RDYN_STRING:
+ val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_STATIC_STRING:
+ val = (unsigned long)addr;
+ break;
+ case FILTER_PTR_STRING:
+ val = (unsigned long)(*(char *)addr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ return val;
+ }
+
+ switch (field->size) {
+ case 1:
+ if (field->is_signed)
+ val = *(char *)addr;
+ else
+ val = *(unsigned char *)addr;
+ break;
+ case 2:
+ if (field->is_signed)
+ val = *(short *)addr;
+ else
+ val = *(unsigned short *)addr;
+ break;
+ case 4:
+ if (field->is_signed)
+ val = *(int *)addr;
+ else
+ val = *(unsigned int *)addr;
+ break;
+ default:
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ return val;
+}
+
+static int get_eprobe_size(struct trace_probe *tp, void *rec)
+{
+ struct fetch_insn *code;
+ struct probe_arg *arg;
+ int i, len, ret = 0;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ if (arg->dynamic) {
+ unsigned long val;
+
+ code = arg->code;
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ if (process_common_fetch_insn(code, &val) < 0)
+ continue;
+ }
+ code++;
+ len = process_fetch_insn_bottom(code, val, NULL, NULL);
+ if (len > 0)
+ ret += len;
+ }
+ }
+
+ return ret;
+}
+
+/* Kprobe specific fetch functions */
+
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ unsigned long val;
+ int ret;
+
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
+ }
+ code++;
+ return process_fetch_insn_bottom(code, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* eprobe handler */
+static inline void
+__eprobe_trace_func(struct eprobe_data *edata, void *rec)
+{
+ struct eprobe_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != edata->file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(edata->file))
+ return;
+
+ dsize = get_eprobe_size(&edata->ep->tp, rec);
+
+ entry = trace_event_buffer_reserve(&fbuffer, edata->file,
+ sizeof(*entry) + edata->ep->tp.size + dsize);
+
+ if (!entry)
+ return;
+
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+/*
+ * The event probe implementation uses event triggers to get access to
+ * the event it is attached to, but is not an actual trigger. The below
+ * functions are just stubs to fulfill what is needed to use the trigger
+ * infrastructure.
+ */
+static int eprobe_trigger_init(struct event_trigger_data *data)
+{
+ return 0;
+}
+
+static void eprobe_trigger_free(struct event_trigger_data *data)
+{
+
+}
+
+static int eprobe_trigger_print(struct seq_file *m,
+ struct event_trigger_data *data)
+{
+ /* Do not print eprobe event triggers */
+ return 0;
+}
+
+static void eprobe_trigger_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *rbe)
+{
+ struct eprobe_data *edata = data->private_data;
+
+ if (unlikely(!rec))
+ return;
+
+ __eprobe_trace_func(edata, rec);
+}
+
+static struct event_trigger_ops eprobe_trigger_ops = {
+ .trigger = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
+};
+
+static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter)
+{
+ return -1;
+}
+
+static int eprobe_trigger_reg_func(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ return -1;
+}
+
+static void eprobe_trigger_unreg_func(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+
+}
+
+static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
+{
+ return &eprobe_trigger_ops;
+}
+
+static struct event_command event_trigger_cmd = {
+ .name = "eprobe",
+ .trigger_type = ETT_EVENT_EPROBE,
+ .flags = EVENT_CMD_FL_NEEDS_REC,
+ .parse = eprobe_trigger_cmd_parse,
+ .reg = eprobe_trigger_reg_func,
+ .unreg = eprobe_trigger_unreg_func,
+ .unreg_all = NULL,
+ .get_trigger_ops = eprobe_trigger_get_ops,
+ .set_filter = NULL,
+};
+
+static struct event_trigger_data *
+new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
+{
+ struct event_trigger_data *trigger;
+ struct event_filter *filter = NULL;
+ struct eprobe_data *edata;
+ int ret;
+
+ edata = kzalloc(sizeof(*edata), GFP_KERNEL);
+ trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
+ if (!trigger || !edata) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ trigger->flags = EVENT_TRIGGER_FL_PROBE;
+ trigger->count = -1;
+ trigger->ops = &eprobe_trigger_ops;
+
+ /*
+ * EVENT PROBE triggers are not registered as commands with
+ * register_event_command(), as they are not controlled by the user
+ * from the trigger file
+ */
+ trigger->cmd_ops = &event_trigger_cmd;
+
+ INIT_LIST_HEAD(&trigger->list);
+
+ if (ep->filter_str) {
+ ret = create_event_filter(file->tr, ep->event,
+ ep->filter_str, false, &filter);
+ if (ret)
+ goto error;
+ }
+ RCU_INIT_POINTER(trigger->filter, filter);
+
+ edata->file = file;
+ edata->ep = ep;
+ trigger->private_data = edata;
+
+ return trigger;
+error:
+ free_event_filter(filter);
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(ret);
+}
+
+static int enable_eprobe(struct trace_eprobe *ep,
+ struct trace_event_file *eprobe_file)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct trace_array *tr = eprobe_file->tr;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+ trigger = new_eprobe_trigger(ep, eprobe_file);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+
+ list_add_tail_rcu(&trigger->list, &file->triggers);
+
+ trace_event_trigger_enable_disable(file, 1);
+ update_cond_flag(file);
+
+ return 0;
+}
+
+static struct trace_event_functions eprobe_funcs = {
+ .trace = print_eprobe_event
+};
+
+static int disable_eprobe(struct trace_eprobe *ep,
+ struct trace_array *tr)
+{
+ struct event_trigger_data *trigger = NULL, *iter;
+ struct trace_event_file *file;
+ struct event_filter *filter;
+ struct eprobe_data *edata;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (!(iter->flags & EVENT_TRIGGER_FL_PROBE))
+ continue;
+ edata = iter->private_data;
+ if (edata->ep == ep) {
+ trigger = iter;
+ break;
+ }
+ }
+ if (!trigger)
+ return -ENODEV;
+
+ list_del_rcu(&trigger->list);
+
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+
+ /* Make sure nothing is using the edata or trigger */
+ tracepoint_synchronize_unregister();
+
+ filter = rcu_access_pointer(trigger->filter);
+
+ if (filter)
+ free_event_filter(filter);
+ kfree(edata);
+ kfree(trigger);
+
+ return 0;
+}
+
+static int enable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_eprobe *ep;
+ bool enabled;
+ int ret = 0;
+ int cnt = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (enabled)
+ return 0;
+
+ for_each_trace_eprobe_tp(ep, tp) {
+ ret = enable_eprobe(ep, file);
+ if (ret)
+ break;
+ enabled = true;
+ cnt++;
+ }
+
+ if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled) {
+ /*
+ * It's a bug if one failed for something other than memory
+ * not being available but another eprobe succeeded.
+ */
+ WARN_ON_ONCE(ret != -ENOMEM);
+
+ for_each_trace_eprobe_tp(ep, tp) {
+ disable_eprobe(ep, file->tr);
+ if (!--cnt)
+ break;
+ }
+ }
+ if (file)
+ trace_probe_remove_file(tp, file);
+ else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ }
+
+ return ret;
+}
+
+static int disable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_eprobe *ep;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ for_each_trace_eprobe_tp(ep, tp)
+ disable_eprobe(ep, file->tr);
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+static int eprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_eprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_eprobe(event, file);
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static inline void init_trace_eprobe_call(struct trace_eprobe *ep)
+{
+ struct trace_event_call *call = trace_probe_event_call(&ep->tp);
+
+ call->flags = TRACE_EVENT_FL_EPROBE;
+ call->event.funcs = &eprobe_funcs;
+ call->class->fields_array = eprobe_fields_array;
+ call->class->reg = eprobe_register;
+}
+
+static struct trace_event_call *
+find_and_get_event(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ /* Skip other probes and ftrace events */
+ if (tp_event->flags &
+ (TRACE_EVENT_FL_IGNORE_ENABLE |
+ TRACE_EVENT_FL_KPROBE |
+ TRACE_EVENT_FL_UPROBE |
+ TRACE_EVENT_FL_EPROBE))
+ continue;
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ if (!trace_event_try_get_ref(tp_event))
+ return NULL;
+ return tp_event;
+ }
+ return NULL;
+}
+
+static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
+{
+ struct traceprobe_parse_context ctx = {
+ .event = ep->event,
+ .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
+ };
+ int ret;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
+ /* Handle symbols "@" */
+ if (!ret)
+ ret = traceprobe_update_arg(&ep->tp.args[i]);
+
+ traceprobe_finish_parse(&ctx);
+ return ret;
+}
+
+static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
+{
+ struct event_filter *dummy = NULL;
+ int i, ret, len = 0;
+ char *p;
+
+ if (argc == 0) {
+ trace_probe_log_err(0, NO_EP_FILTER);
+ return -EINVAL;
+ }
+
+ /* Recover the filter string */
+ for (i = 0; i < argc; i++)
+ len += strlen(argv[i]) + 1;
+
+ ep->filter_str = kzalloc(len, GFP_KERNEL);
+ if (!ep->filter_str)
+ return -ENOMEM;
+
+ p = ep->filter_str;
+ for (i = 0; i < argc; i++) {
+ if (i)
+ ret = snprintf(p, len, " %s", argv[i]);
+ else
+ ret = snprintf(p, len, "%s", argv[i]);
+ p += ret;
+ len -= ret;
+ }
+
+ /*
+ * Ensure the filter string can be parsed correctly. Note, this
+ * filter string is for the original event, not for the eprobe.
+ */
+ ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
+ true, &dummy);
+ free_event_filter(dummy);
+ if (ret)
+ goto error;
+
+ return 0;
+error:
+ kfree(ep->filter_str);
+ ep->filter_str = NULL;
+ return ret;
+}
+
+static int __trace_eprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
+ * Fetch args (no space):
+ * <name>=$<field>[:TYPE]
+ */
+ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
+ const char *sys_event = NULL, *sys_name = NULL;
+ struct trace_event_call *event_call;
+ struct trace_eprobe *ep = NULL;
+ char buf1[MAX_EVENT_NAME_LEN];
+ char buf2[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
+ int ret = 0, filter_idx = 0;
+ int i, filter_cnt;
+
+ if (argc < 2 || argv[0][0] != 'e')
+ return -ECANCELED;
+
+ trace_probe_log_init("event_probe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event) {
+ event++;
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ }
+
+ trace_probe_log_set_index(1);
+ sys_event = argv[1];
+ ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
+ if (ret || !sys_event || !sys_name) {
+ trace_probe_log_err(0, NO_EVENT_INFO);
+ goto parse_error;
+ }
+
+ if (!event) {
+ strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
+ event = buf1;
+ }
+
+ for (i = 2; i < argc; i++) {
+ if (!strcmp(argv[i], "if")) {
+ filter_idx = i + 1;
+ filter_cnt = argc - filter_idx;
+ argc = i;
+ break;
+ }
+ }
+
+ mutex_lock(&event_mutex);
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ mutex_unlock(&event_mutex);
+
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ if (ret == -ENODEV)
+ trace_probe_log_err(0, BAD_ATTACH_EVENT);
+ /* This must return -ENOMEM or missing event, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
+ ep = NULL;
+ goto error;
+ }
+
+ if (filter_idx) {
+ trace_probe_log_set_index(filter_idx);
+ ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
+ if (ret)
+ goto parse_error;
+ } else
+ ep->filter_str = NULL;
+
+ argc -= 2; argv += 2;
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ret = trace_eprobe_tp_update_arg(ep, argv, i);
+ if (ret)
+ goto error;
+ }
+ ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
+ if (ret < 0)
+ goto error;
+ init_trace_eprobe_call(ep);
+ mutex_lock(&event_mutex);
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ mutex_unlock(&event_mutex);
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ mutex_unlock(&event_mutex);
+ return ret;
+parse_error:
+ ret = -EINVAL;
+error:
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup eprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int trace_events_eprobe_init_early(void)
+{
+ int err = 0;
+
+ err = dyn_event_register(&eprobe_dyn_event_ops);
+ if (err)
+ pr_warn("Could not register eprobe_dyn_event_ops\n");
+
+ return err;
+}
+core_initcall(trace_events_eprobe_init_early);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 643e0b19920d..05e791241812 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
/*
* Force it to be aligned to unsigned long to avoid misaligned accesses
- * suprises
+ * surprises
*/
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
perf_trace_t;
@@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
int i;
if (--tp_event->perf_refcount > 0)
- goto out;
+ return;
tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
@@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
perf_trace_buf[i] = NULL;
}
}
-out:
- module_put(tp_event->mod);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -224,10 +222,10 @@ int perf_trace_init(struct perf_event *p_event)
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
- try_module_get(tp_event->mod)) {
+ trace_event_try_get_ref(tp_event)) {
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
break;
}
}
@@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
}
@@ -252,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
struct trace_event_call *tp_event;
if (p_event->attr.kprobe_func) {
- func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
- if (!func)
- return -ENOMEM;
- ret = strncpy_from_user(
- func, u64_to_user_ptr(p_event->attr.kprobe_func),
- KSYM_NAME_LEN);
- if (ret == KSYM_NAME_LEN)
- ret = -E2BIG;
- if (ret < 0)
- goto out;
+ func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
+ KSYM_NAME_LEN);
+ if (IS_ERR(func)) {
+ ret = PTR_ERR(func);
+ return (ret == -EINVAL) ? -E2BIG : ret;
+ }
if (func[0] == '\0') {
kfree(func);
@@ -292,6 +287,7 @@ void perf_kprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_kprobe(p_event->tp_event);
@@ -347,6 +343,7 @@ void perf_uprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_uprobe(p_event->tp_event);
}
@@ -400,7 +397,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough, wanted %d, have %d",
+ size, PERF_MAX_TRACE_SIZE))
return NULL;
*rctxp = rctx = perf_swevent_get_recursion_context();
@@ -421,28 +419,33 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
void perf_trace_buf_update(void *record, u16 type)
{
struct trace_entry *entry = record;
- int pc = preempt_count();
- unsigned long flags;
- local_save_flags(flags);
- tracing_generic_entry_update(entry, type, flags, pc);
+ tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);
#ifdef CONFIG_FUNCTION_TRACER
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *pt_regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_entry *entry;
struct perf_event *event;
struct hlist_head head;
struct pt_regs regs;
int rctx;
+ int bit;
- if ((unsigned long)ops->private != smp_processor_id())
+ if (!rcu_is_watching())
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
return;
+ if ((unsigned long)ops->private != smp_processor_id())
+ goto out;
+
event = container_of(ops, struct perf_event, ftrace_ops);
/*
@@ -463,13 +466,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
if (!entry)
- return;
+ goto out;
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
1, &regs, &head, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}
@@ -477,7 +482,6 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags = FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
ops->private = (void *)(unsigned long)nr_cpu_ids;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f6f55682d3e2..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -38,6 +38,15 @@ DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
+static bool eventdir_initialized;
+
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -105,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type)
+ int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
@@ -124,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->len = len;
list_add(&field->link, head);
@@ -141,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type);
+ is_signed, filter_type, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
+static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type, int len)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type, len);
+}
+
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type); \
+ filter_type, 0); \
if (ret) \
return ret;
@@ -157,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), FILTER_OTHER, 0); \
if (ret) \
return ret;
@@ -167,8 +191,11 @@ static int trace_define_generic_fields(void)
__generic_field(int, CPU, FILTER_CPU);
__generic_field(int, cpu, FILTER_CPU);
+ __generic_field(int, common_cpu, FILTER_CPU);
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
+ __generic_field(char *, stacktrace, FILTER_STACKTRACE);
+ __generic_field(char *, STACKTRACE, FILTER_STACKTRACE);
return ret;
}
@@ -180,6 +207,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned short, type);
__common_field(unsigned char, flags);
+ /* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
@@ -216,6 +244,221 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field = call->class->fields_array;
+ const char *array_descriptor;
+ const char *p = fmt;
+ int len;
+
+ if (!(len = str_has_prefix(fmt, "REC->")))
+ return false;
+ fmt += len;
+ for (p = fmt; *p; p++) {
+ if (!isalnum(*p) && *p != '_')
+ break;
+ }
+ len = p - fmt;
+
+ for (; field->type; field++) {
+ if (strncmp(field->name, fmt, len) ||
+ field->name[len])
+ continue;
+ array_descriptor = strchr(field->type, '[');
+ /* This is an array and is OK to dereference. */
+ return array_descriptor != NULL;
+ }
+ return false;
+}
+
+/*
+ * Examine the print fmt of the event looking for unsafe dereference
+ * pointers using %p* that could be recorded in the trace event and
+ * much later referenced after the pointer was freed. Dereferencing
+ * pointers are OK, if it is dereferenced into the event itself.
+ */
+static void test_event_printk(struct trace_event_call *call)
+{
+ u64 dereference_flags = 0;
+ bool first = true;
+ const char *fmt, *c, *r, *a;
+ int parens = 0;
+ char in_quote = 0;
+ int start_arg = 0;
+ int arg = 0;
+ int i;
+
+ fmt = call->print_fmt;
+
+ if (!fmt)
+ return;
+
+ for (i = 0; fmt[i]; i++) {
+ switch (fmt[i]) {
+ case '\\':
+ i++;
+ if (!fmt[i])
+ return;
+ continue;
+ case '"':
+ case '\'':
+ /*
+ * The print fmt starts with a string that
+ * is processed first to find %p* usage,
+ * then after the first string, the print fmt
+ * contains arguments that are used to check
+ * if the dereferenced %p* usage is safe.
+ */
+ if (first) {
+ if (fmt[i] == '\'')
+ continue;
+ if (in_quote) {
+ arg = 0;
+ first = false;
+ /*
+ * If there was no %p* uses
+ * the fmt is OK.
+ */
+ if (!dereference_flags)
+ return;
+ }
+ }
+ if (in_quote) {
+ if (in_quote == fmt[i])
+ in_quote = 0;
+ } else {
+ in_quote = fmt[i];
+ }
+ continue;
+ case '%':
+ if (!first || !in_quote)
+ continue;
+ i++;
+ if (!fmt[i])
+ return;
+ switch (fmt[i]) {
+ case '%':
+ continue;
+ case 'p':
+ /* Find dereferencing fields */
+ switch (fmt[i + 1]) {
+ case 'B': case 'R': case 'r':
+ case 'b': case 'M': case 'm':
+ case 'I': case 'i': case 'E':
+ case 'U': case 'V': case 'N':
+ case 'a': case 'd': case 'D':
+ case 'g': case 't': case 'C':
+ case 'O': case 'f':
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ }
+ break;
+ default:
+ {
+ bool star = false;
+ int j;
+
+ /* Increment arg if %*s exists. */
+ for (j = 0; fmt[i + j]; j++) {
+ if (isdigit(fmt[i + j]) ||
+ fmt[i + j] == '.')
+ continue;
+ if (fmt[i + j] == '*') {
+ star = true;
+ continue;
+ }
+ if ((fmt[i + j] == 's') && star)
+ arg++;
+ break;
+ }
+ break;
+ } /* default */
+
+ } /* switch */
+ arg++;
+ continue;
+ case '(':
+ if (in_quote)
+ continue;
+ parens++;
+ continue;
+ case ')':
+ if (in_quote)
+ continue;
+ parens--;
+ if (WARN_ONCE(parens < 0,
+ "Paren mismatch for event: %s\narg='%s'\n%*s",
+ trace_event_name(call),
+ fmt + start_arg,
+ (i - start_arg) + 5, "^"))
+ return;
+ continue;
+ case ',':
+ if (in_quote || parens)
+ continue;
+ i++;
+ while (isspace(fmt[i]))
+ i++;
+ start_arg = i;
+ if (!(dereference_flags & (1ULL << arg)))
+ goto next_arg;
+
+ /* Find the REC-> in the argument */
+ c = strchr(fmt + i, ',');
+ r = strstr(fmt + i, "REC->");
+ if (r && (!c || r < c)) {
+ /*
+ * Addresses of events on the buffer,
+ * or an array on the buffer is
+ * OK to dereference.
+ * There's ways to fool this, but
+ * this is to catch common mistakes,
+ * not malicious code.
+ */
+ a = strchr(fmt + i, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
+ }
+
+ next_arg:
+ i--;
+ arg++;
+ }
+ }
+
+ /*
+ * If you triggered the below warning, the trace event reported
+ * uses an unsafe dereference pointer %p*. As the data stored
+ * at the trace event time may no longer exist when the trace
+ * event is printed, dereferencing to the original source is
+ * unsafe. The source of the dereference must be copied into the
+ * event itself, and the dereference must access the copy instead.
+ */
+ if (WARN_ON_ONCE(dereference_flags)) {
+ arg = 1;
+ while (!(dereference_flags & 1)) {
+ dereference_flags >>= 1;
+ arg++;
+ }
+ pr_warn("event %s has unsafe dereference of argument %d\n",
+ trace_event_name(call), arg);
+ pr_warn("print_fmt: %s\n", fmt);
+ }
+}
+
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
@@ -224,6 +467,8 @@ int trace_event_raw_init(struct trace_event_call *call)
if (!id)
return -ENODEV;
+ test_event_printk(call);
+
return 0;
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
@@ -257,22 +502,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
trace_event_ignore_this_pid(trace_file))
return NULL;
- local_save_flags(fbuffer->flags);
- fbuffer->pc = preempt_count();
/*
* If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- fbuffer->pc--;
+ fbuffer->trace_ctx = tracing_gen_ctx_dec();
fbuffer->trace_file = trace_file;
fbuffer->event =
trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->trace_ctx);
if (!fbuffer->event)
return NULL;
@@ -369,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
- unsigned long file_flags = file->flags;
int ret = 0;
int disable;
@@ -393,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Disable use of trace_buffered_event */
+ trace_buffered_event_disable();
} else
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
@@ -431,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_inc_return(&file->sm_ref) > 1)
break;
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Enable use of trace_buffered_event */
+ trace_buffered_event_enable();
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
@@ -470,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
}
- /* Enable or disable use of trace_buffered_event */
- if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
- (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
- if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
- trace_buffered_event_enable();
- else
- trace_buffered_event_disable();
- }
-
return ret;
}
@@ -538,19 +774,21 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -574,7 +812,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -676,10 +916,10 @@ static void __ftrace_clear_event_pids(struct trace_array *tr, int type)
tracepoint_synchronize_unregister();
if ((type & TRACE_PIDS) && pid_list)
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
if ((type & TRACE_NO_PIDS) && no_pid_list)
- trace_free_pid_list(no_pid_list);
+ trace_pid_list_free(no_pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr, int type)
@@ -744,32 +984,41 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- tracefs_remove(dir->entry);
+ eventfs_remove_dir(dir->ei);
list_del(&dir->list);
__put_system_dir(dir);
}
}
-static void remove_event_file_dir(struct trace_event_file *file)
+void event_file_get(struct trace_event_file *file)
{
- struct dentry *dir = file->dir;
- struct dentry *child;
+ atomic_inc(&file->ref);
+}
- if (dir) {
- spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_child) {
- if (d_really_is_positive(child)) /* probably unneeded */
- d_inode(child)->i_private = NULL;
- }
- spin_unlock(&dir->d_lock);
+void event_file_put(struct trace_event_file *file)
+{
+ if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ kmem_cache_free(file_cachep, file);
+ return;
+ }
- tracefs_remove(dir);
+ if (atomic_dec_and_test(&file->ref)) {
+ /* Count should only go to zero when it is freed */
+ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
+ return;
+ kmem_cache_free(file_cachep, file);
}
+}
+static void remove_event_file_dir(struct trace_event_file *file)
+{
+ eventfs_remove_dir(file->ei);
list_del(&file->list);
remove_subsystem(file->system);
free_event_filter(file->filter);
- kmem_cache_free(file_cachep, file);
+ file->flags |= EVENT_FILE_FL_FREED;
+ event_file_put(file);
}
/*
@@ -939,7 +1188,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -1142,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file)
+ if (!file || flags & EVENT_FILE_FL_FREED)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1170,18 +1419,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
- if (ret < 0)
- return ret;
-
switch (val) {
case 0:
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (likely(file))
+ if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0) {
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
ret = ftrace_event_enable_disable(file, val);
+ }
mutex_unlock(&event_mutex);
break;
@@ -1211,7 +1462,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!trace_event_name(call) || !call->class || !call->class->reg)
+ if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ !trace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -1254,7 +1506,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(dir->tr);
if (ret < 0)
return ret;
@@ -1358,12 +1610,17 @@ static int f_show(struct seq_file *m, void *v)
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
- else
- seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ else if (field->len)
+ seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
- array_descriptor, field->offset,
+ field->len, field->offset,
field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ field->offset, field->size, !!field->is_signed);
return 0;
}
@@ -1448,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (file)
+ if (file && !(file->flags & EVENT_FILE_FL_FREED))
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1495,9 +1752,9 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
+ struct trace_subsystem_dir *dir = NULL, *iter_dir;
+ struct trace_array *tr = NULL, *iter_tr;
struct event_subsystem *system = NULL;
- struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
- struct trace_array *tr;
int ret;
if (tracing_is_disabled())
@@ -1506,10 +1763,12 @@ static int subsystem_open(struct inode *inode, struct file *filp)
/* Make sure the system still exists */
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- list_for_each_entry(dir, &tr->systems, list) {
- if (dir == inode->i_private) {
+ list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(iter_dir, &iter_tr->systems, list) {
+ if (iter_dir == inode->i_private) {
/* Don't open systems with no events */
+ tr = iter_tr;
+ dir = iter_dir;
if (dir->nr_events) {
__get_system_dir(dir);
system = dir->subsystem;
@@ -1525,9 +1784,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
if (!system)
return -ENODEV;
- /* Some versions of gcc think dir can be uninitialized here */
- WARN_ON(!dir);
-
/* Still need to increment the ref count of the system */
if (trace_array_get(tr) < 0) {
put_system(dir);
@@ -1637,9 +1893,33 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
struct trace_seq *s;
int r;
@@ -1652,7 +1932,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -1724,7 +2004,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -1757,7 +2037,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
tracepoint_synchronize_unregister();
- trace_free_pid_list(filtered_pids);
+ trace_pid_list_free(filtered_pids);
} else if (pid_list && !other_pids) {
register_pid_events(tr);
}
@@ -1858,9 +2138,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = {
};
static const struct file_operations ftrace_enable_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_enable_read,
.write = event_enable_write,
+ .release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -1877,9 +2158,10 @@ static const struct file_operations ftrace_event_id_fops = {
};
static const struct file_operations ftrace_event_filter_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_filter_read,
.write = event_filter_write,
+ .release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -1907,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -2032,8 +2322,6 @@ create_new_subsystem(const char *name)
if (!system->name)
goto out_free;
- system->filter = NULL;
-
system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
if (!system->filter)
goto out_free;
@@ -2048,13 +2336,40 @@ create_new_subsystem(const char *name)
return NULL;
}
-static struct dentry *
+static int system_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (strcmp(name, "filter") == 0)
+ *fops = &ftrace_subsystem_filter_fops;
+
+ else if (strcmp(name, "enable") == 0)
+ *fops = &ftrace_system_enable_fops;
+
+ else
+ return 0;
+
+ *mode = TRACE_MODE_WRITE;
+ return 1;
+}
+
+static struct eventfs_inode *
event_subsystem_dir(struct trace_array *tr, const char *name,
- struct trace_event_file *file, struct dentry *parent)
+ struct trace_event_file *file, struct eventfs_inode *parent)
{
+ struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct event_subsystem *system;
- struct dentry *entry;
+ struct eventfs_inode *ei;
+ int nr_entries;
+ static struct eventfs_entry system_entries[] = {
+ {
+ .name = "filter",
+ .callback = system_callback,
+ },
+ {
+ .name = "enable",
+ .callback = system_callback,
+ }
+ };
/* First see if we did not already create this dir */
list_for_each_entry(dir, &tr->systems, list) {
@@ -2062,18 +2377,18 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (strcmp(system->name, name) == 0) {
dir->nr_events++;
file->system = dir;
- return dir->entry;
+ return dir->ei;
}
}
/* Now see if the system itself exists. */
- list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ system = NULL;
+ list_for_each_entry(iter, &event_subsystems, list) {
+ if (strcmp(iter->name, name) == 0) {
+ system = iter;
break;
+ }
}
- /* Reset system variable when not found */
- if (&system->list == &event_subsystems)
- system = NULL;
dir = kmalloc(sizeof(*dir), GFP_KERNEL);
if (!dir)
@@ -2086,33 +2401,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = tracefs_create_dir(name, parent);
- if (!dir->entry) {
+ /* ftrace only has directories no files */
+ if (strcmp(name, "ftrace") == 0)
+ nr_entries = 0;
+ else
+ nr_entries = ARRAY_SIZE(system_entries);
+
+ ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
+ if (IS_ERR(ei)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
+ dir->ei = ei;
dir->tr = tr;
dir->ref_count = 1;
dir->nr_events = 1;
dir->subsystem = system;
file->system = dir;
- entry = tracefs_create_file("filter", 0644, dir->entry, dir,
- &ftrace_subsystem_filter_fops);
- if (!entry) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
-
- trace_create_file("enable", 0644, dir->entry, dir,
- &ftrace_system_enable_fops);
-
list_add(&dir->list, &tr->systems);
- return dir->entry;
+ return dir->ei;
out_free:
kfree(dir);
@@ -2124,43 +2435,10 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
static int
-event_create_dir(struct dentry *parent, struct trace_event_file *file)
+event_define_fields(struct trace_event_call *call)
{
- struct trace_event_call *call = file->event_call;
- struct trace_array *tr = file->tr;
struct list_head *head;
- struct dentry *d_events;
- const char *name;
- int ret;
-
- /*
- * If the trace point header did not define TRACE_SYSTEM
- * then the system would be called "TRACE_SYSTEM".
- */
- if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
- d_events = event_subsystem_dir(tr, call->class->system, file, parent);
- if (!d_events)
- return -ENOMEM;
- } else
- d_events = parent;
-
- name = trace_event_name(call);
- file->dir = tracefs_create_dir(name, d_events);
- if (!file->dir) {
- pr_warn("Could not create tracefs '%s' directory\n", name);
- return -1;
- }
-
- if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", 0644, file->dir, file,
- &ftrace_enable_fops);
-
-#ifdef CONFIG_PERF_EVENTS
- if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir,
- (void *)(long)call->event.type,
- &ftrace_event_id_fops);
-#endif
+ int ret = 0;
/*
* Other events may have the same class. Only update
@@ -2173,54 +2451,184 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
for (; field->type; field++) {
if (field->type == TRACE_FUNCTION_TYPE) {
- ret = field->define_fields(call);
+ field->define_fields(call);
break;
}
offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
+ ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
- field->is_signed, field->filter_type);
- if (ret)
+ field->is_signed, field->filter_type,
+ field->len);
+ if (WARN_ON_ONCE(ret)) {
+ pr_err("error code is %d\n", ret);
break;
+ }
offset += field->size;
}
- if (ret < 0) {
- pr_warn("Could not initialize trace point events/%s\n",
- name);
- return -1;
- }
+ }
+
+ return ret;
+}
+
+static int event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ struct trace_event_file *file = *data;
+ struct trace_event_call *call = file->event_call;
+
+ if (strcmp(name, "format") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_event_format_fops;
+ *data = call;
+ return 1;
}
/*
* Only event directories that can be enabled should have
- * triggers or filters.
+ * triggers or filters, with the exception of the "print"
+ * event that can have a "trigger" file.
*/
if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- trace_create_file("filter", 0644, file->dir, file,
- &ftrace_event_filter_fops);
+ if (call->class->reg && strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "filter") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_event_filter_fops;
+ return 1;
+ }
+ }
+
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ strcmp(trace_event_name(call), "print") == 0) {
+ if (strcmp(name, "trigger") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &event_trigger_fops;
+ return 1;
+ }
+ }
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "id") == 0) {
+ *mode = TRACE_MODE_READ;
+ *data = (void *)(long)call->event.type;
+ *fops = &ftrace_event_id_fops;
+ return 1;
}
+#endif
#ifdef CONFIG_HIST_TRIGGERS
- trace_create_file("hist", 0444, file->dir, file,
- &event_hist_fops);
+ if (strcmp(name, "hist") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_fops;
+ return 1;
+ }
#endif
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- trace_create_file("hist_debug", 0444, file->dir, file,
- &event_hist_debug_fops);
+ if (strcmp(name, "hist_debug") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_debug_fops;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "inject") == 0) {
+ *mode = 0200;
+ *fops = &event_inject_fops;
+ return 1;
+ }
#endif
- trace_create_file("format", 0444, file->dir, call,
- &ftrace_event_format_fops);
+ return 0;
+}
+static int
+event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
+{
+ struct trace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
+ struct eventfs_inode *e_events;
+ struct eventfs_inode *ei;
+ const char *name;
+ int nr_entries;
+ int ret;
+ static struct eventfs_entry event_entries[] = {
+ {
+ .name = "enable",
+ .callback = event_callback,
+ },
+ {
+ .name = "filter",
+ .callback = event_callback,
+ },
+ {
+ .name = "trigger",
+ .callback = event_callback,
+ },
+ {
+ .name = "format",
+ .callback = event_callback,
+ },
+#ifdef CONFIG_PERF_EVENTS
+ {
+ .name = "id",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+ {
+ .name = "hist",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ {
+ .name = "hist_debug",
+ .callback = event_callback,
+ },
+#endif
#ifdef CONFIG_TRACE_EVENT_INJECT
- if (call->event.type && call->class->reg)
- trace_create_file("inject", 0200, file->dir, file,
- &event_inject_fops);
+ {
+ .name = "inject",
+ .callback = event_callback,
+ },
#endif
+ };
+
+ /*
+ * If the trace point header did not define TRACE_SYSTEM
+ * then the system would be called "TRACE_SYSTEM". This should
+ * never happen.
+ */
+ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
+ return -ENODEV;
+
+ e_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!e_events)
+ return -ENOMEM;
+
+ nr_entries = ARRAY_SIZE(event_entries);
+
+ name = trace_event_name(call);
+ ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
+ if (IS_ERR(ei)) {
+ pr_warn("Could not create tracefs '%s' directory\n", name);
+ return -1;
+ }
+
+ file->ei = ei;
+
+ ret = event_define_fields(call);
+ if (ret < 0) {
+ pr_warn("Could not initialize trace point events/%s\n", name);
+ return ret;
+ }
return 0;
}
@@ -2301,7 +2709,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
return ret;
list_add(&call->list, &ftrace_events);
- call->mod = mod;
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
+ atomic_set(&call->refcnt, 0);
+ else
+ call->module = mod;
return 0;
}
@@ -2404,6 +2815,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2422,9 +2903,9 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
/*
- * Since calls are grouped by systems, the likelyhood that the
+ * Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
- * previous call is high. As an optimization, we skip seaching
+ * previous call is high. As an optimization, we skip searching
* for a map[] that matches the call's system if the last call
* was from the same system. That's what last_i is for. If the
* call has the same system as the previous call, then last_i
@@ -2439,21 +2920,59 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
+ cond_resched();
}
up_write(&trace_event_sem);
}
+static bool event_in_systems(struct trace_event_call *call,
+ const char *systems)
+{
+ const char *system;
+ const char *p;
+
+ if (!systems)
+ return true;
+
+ system = call->class->system;
+ p = strstr(systems, system);
+ if (!p)
+ return false;
+
+ if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+ return false;
+
+ p += strlen(system);
+ return !*p || isspace(*p) || *p == ',';
+}
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
+ struct trace_pid_list *no_pid_list;
+ struct trace_pid_list *pid_list;
struct trace_event_file *file;
+ unsigned int first;
+
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+
+ if (!trace_pid_list_first(pid_list, &first) ||
+ !trace_pid_list_first(no_pid_list, &first))
+ file->flags |= EVENT_FILE_FL_PID_FILTER;
file->event_call = call;
file->tr = tr;
@@ -2461,10 +2980,47 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
+ event_file_get(file);
return file;
}
+#define MAX_BOOT_TRIGGERS 32
+
+static struct boot_triggers {
+ const char *event;
+ char *trigger;
+} bootup_triggers[MAX_BOOT_TRIGGERS];
+
+static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int nr_boot_triggers;
+
+static __init int setup_trace_triggers(char *str)
+{
+ char *trigger;
+ char *buf;
+ int i;
+
+ strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ trace_set_ring_buffer_expanded(NULL);
+ disable_tracing_selftest("running event triggers");
+
+ buf = bootup_trigger_buf;
+ for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ trigger = strsep(&buf, ",");
+ if (!trigger)
+ break;
+ bootup_triggers[i].event = strsep(&trigger, ".");
+ bootup_triggers[i].trigger = trigger;
+ if (!bootup_triggers[i].trigger)
+ break;
+ }
+
+ nr_boot_triggers = i;
+ return 1;
+}
+__setup("trace_trigger=", setup_trace_triggers);
+
/* Add an event to a trace directory */
static int
__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
@@ -2472,26 +3028,72 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (eventdir_initialized)
+ return event_create_dir(tr->event_dir, file);
+ else
+ return event_define_fields(call);
+}
+
+static void trace_early_triggers(struct trace_event_file *file, const char *name)
+{
+ int ret;
+ int i;
- return event_create_dir(tr->event_dir, file);
+ for (i = 0; i < nr_boot_triggers; i++) {
+ if (strcmp(name, bootup_triggers[i].event))
+ continue;
+ mutex_lock(&event_mutex);
+ ret = trigger_process_regex(file, bootup_triggers[i].trigger);
+ mutex_unlock(&event_mutex);
+ if (ret)
+ pr_err("Failed to register trigger '%s' on event %s\n",
+ bootup_triggers[i].trigger,
+ bootup_triggers[i].event);
+ }
}
/*
- * Just create a decriptor for early init. A descriptor is required
+ * Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
* the filesystem is initialized.
*/
-static __init int
+static int
__trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
struct trace_event_file *file;
+ int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = event_define_fields(call);
+ if (ret)
+ return ret;
+
+ trace_early_triggers(file, trace_event_name(call));
return 0;
}
@@ -2514,6 +3116,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2545,7 +3148,10 @@ static int probe_remove_event_call(struct trace_event_call *call)
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
- return -EBUSY;
+ goto busy;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
/*
* The do_for_each_event_file_safe() is
* a double loop. After finding the call for this
@@ -2558,6 +3164,12 @@ static int probe_remove_event_call(struct trace_event_call *call)
__trace_remove_event_call(call);
return 0;
+ busy:
+ /* No need to clear the trace now */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tr->clear_trace = false;
+ }
+ return -EBUSY;
}
/* Remove an event_call */
@@ -2575,6 +3187,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2609,12 +3222,23 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod)
+ if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
+ continue;
+ if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
@@ -2625,7 +3249,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
@@ -2646,7 +3270,7 @@ static int trace_module_notify(struct notifier_block *self,
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
- return 0;
+ return NOTIFY_OK;
}
static struct notifier_block trace_module_nb = {
@@ -2755,7 +3379,7 @@ struct trace_event_file *trace_get_event_file(const char *instance,
}
/* Don't let event modules unload while in use */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
ret = -EBUSY;
@@ -2785,7 +3409,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file);
void trace_put_event_file(struct trace_event_file *file)
{
mutex_lock(&event_mutex);
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
mutex_unlock(&event_mutex);
trace_array_put(file->tr);
@@ -2920,7 +3544,7 @@ static int free_probe_data(void *data)
if (!edata->ref) {
/* Remove the SOFT_MODE flag */
__ftrace_event_enable_disable(edata->file, 0, 1);
- module_put(edata->file->event_call->mod);
+ trace_event_put_ref(edata->file->event_call);
kfree(edata);
}
return 0;
@@ -3053,7 +3677,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -3083,7 +3707,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
out_free:
kfree(data);
goto out;
@@ -3116,14 +3740,13 @@ static inline int register_event_cmds(void) { return 0; }
#endif /* CONFIG_DYNAMIC_FTRACE */
/*
- * The top level array has already had its trace_event_file
- * descriptors created in order to allow for early events to
- * be recorded. This function is called after the tracefs has been
- * initialized, and we now have to create the files associated
- * to the events.
+ * The top level array and trace arrays created by boot-time tracing
+ * have already had its trace_event_file descriptors created in order
+ * to allow for early events to be recorded.
+ * This function is called after the tracefs has been initialized,
+ * and we now have to create the files associated to the events.
*/
-static __init void
-__trace_early_add_event_dirs(struct trace_array *tr)
+static void __trace_early_add_event_dirs(struct trace_array *tr)
{
struct trace_event_file *file;
int ret;
@@ -3138,20 +3761,20 @@ __trace_early_add_event_dirs(struct trace_array *tr)
}
/*
- * For early boot up, the top trace array requires to have
- * a list of events that can be enabled. This must be done before
- * the filesystem is set up in order to allow events to be traced
- * early.
+ * For early boot up, the top trace array and the trace arrays created
+ * by boot-time tracing require to have a list of events that can be
+ * enabled. This must be done before the filesystem is set up in order
+ * to allow events to be traced early.
*/
-static __init void
-__trace_early_add_events(struct trace_array *tr)
+void __trace_early_add_events(struct trace_array *tr)
{
struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
/* Early boot up should not have any modules loaded */
- if (WARN_ON_ONCE(call->mod))
+ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
+ WARN_ON_ONCE(call->module))
continue;
ret = __trace_early_add_new_event(call, tr);
@@ -3186,67 +3809,82 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
- strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = true;
- tracing_selftest_disabled = true;
+ strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ trace_set_ring_buffer_expanded(NULL);
+ disable_tracing_selftest("running event tracing");
return 1;
}
__setup("trace_event=", setup_trace_event);
+static int events_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_tr_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
+
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
+ return 0;
+
+ return 1;
+}
+
/* Expects to have event_mutex held when called */
static int
create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
- struct dentry *d_events;
+ struct eventfs_inode *e_events;
struct dentry *entry;
-
- entry = tracefs_create_file("set_event", 0644, parent,
- tr, &ftrace_set_event_fops);
- if (!entry) {
- pr_warn("Could not create tracefs 'set_event' entry\n");
+ int nr_entries;
+ static struct eventfs_entry events_entries[] = {
+ {
+ .name = "enable",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_page",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_event",
+ .callback = events_callback,
+ },
+ };
+
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
return -ENOMEM;
- }
- d_events = tracefs_create_dir("events", parent);
- if (!d_events) {
- pr_warn("Could not create tracefs 'events' directory\n");
- return -ENOMEM;
- }
+ nr_entries = ARRAY_SIZE(events_entries);
- entry = trace_create_file("enable", 0644, d_events,
- tr, &ftrace_tr_enable_fops);
- if (!entry) {
- pr_warn("Could not create tracefs 'enable' entry\n");
+ e_events = eventfs_create_events_dir("events", parent, events_entries,
+ nr_entries, tr);
+ if (IS_ERR(e_events)) {
+ pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", 0644, parent,
- tr, &ftrace_set_event_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_pid' entry\n");
-
- entry = tracefs_create_file("set_event_notrace_pid", 0644, parent,
- tr, &ftrace_set_event_notrace_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
-
- /* ring buffer internal formats */
- entry = trace_create_file("header_page", 0444, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'header_page' entry\n");
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
- entry = trace_create_file("header_event", 0444, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'header_event' entry\n");
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
- tr->event_dir = d_events;
+ tr->event_dir = e_events;
return 0;
}
@@ -3258,7 +3896,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
*
* When a new instance is created, it needs to set up its events
* directory, as well as other files associated with events. It also
- * creates the event hierachry in the @parent/events directory.
+ * creates the event hierarchy in the @parent/events directory.
*
* Returns 0 on success.
*
@@ -3275,7 +3913,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
goto out;
down_write(&trace_event_sem);
- __trace_add_event_dirs(tr);
+ /* If tr already has the event list, it is initialized in early boot. */
+ if (unlikely(!list_empty(&tr->events)))
+ __trace_early_add_event_dirs(tr);
+ else
+ __trace_add_event_dirs(tr);
up_write(&trace_event_sem);
out:
@@ -3326,7 +3968,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- tracefs_remove(tr->event_dir);
+ eventfs_remove_events_dir(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -3341,10 +3983,9 @@ static __init int event_trace_memsetup(void)
return 0;
}
-static __init void
-early_enable_events(struct trace_array *tr, bool disable_first)
+__init void
+early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
{
- char *buf = bootup_event_buf;
char *token;
int ret;
@@ -3387,6 +4028,8 @@ static __init int event_trace_enable(void)
list_add(&call->list, &ftrace_events);
}
+ register_trigger_cmds();
+
/*
* We need the top trace array to have a working set of trace
* points at early init, before the debug files and directories
@@ -3395,13 +4038,12 @@ static __init int event_trace_enable(void)
*/
__trace_early_add_events(tr);
- early_enable_events(tr, false);
+ early_enable_events(tr, bootup_event_buf, false);
trace_printk_start_comm();
register_event_cmds();
- register_trigger_cmds();
return 0;
}
@@ -3411,10 +4053,10 @@ static __init int event_trace_enable(void)
* initialize events and perhaps start any events that are on the
* command line. Unfortunately, there are some events that will not
* start this early, like the system call tracepoints that need
- * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
- * is called before pid 1 starts, and this flag is never set, making
- * the syscall tracepoint never get reached, but the event is enabled
- * regardless (and not doing anything).
+ * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But
+ * event_trace_enable() is called before pid 1 starts, and this flag
+ * is never set, making the syscall tracepoint never get reached, but
+ * the event is enabled regardless (and not doing anything).
*/
static __init int event_trace_enable_again(void)
{
@@ -3424,40 +4066,38 @@ static __init int event_trace_enable_again(void)
if (!tr)
return -ENODEV;
- early_enable_events(tr, true);
+ early_enable_events(tr, bootup_event_buf, true);
return 0;
}
early_initcall(event_trace_enable_again);
+/* Init fields which doesn't related to the tracefs */
+static __init int event_trace_init_fields(void)
+{
+ if (trace_define_generic_fields())
+ pr_warn("tracing: Failed to allocated generic fields");
+
+ if (trace_define_common_fields())
+ pr_warn("tracing: Failed to allocate common fields");
+
+ return 0;
+}
+
__init int event_trace_init(void)
{
struct trace_array *tr;
- struct dentry *d_tracer;
- struct dentry *entry;
int ret;
tr = top_trace_array();
if (!tr)
return -ENODEV;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
+ trace_create_file("available_events", TRACE_MODE_READ,
+ NULL, tr, &ftrace_avail_fops);
- entry = tracefs_create_file("available_events", 0444, d_tracer,
- tr, &ftrace_avail_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'available_events' entry\n");
-
- if (trace_define_generic_fields())
- pr_warn("tracing: Failed to allocated generic fields");
-
- if (trace_define_common_fields())
- pr_warn("tracing: Failed to allocate common fields");
-
- ret = early_event_add_tracer(d_tracer, tr);
+ ret = early_event_add_tracer(NULL, tr);
if (ret)
return ret;
@@ -3466,6 +4106,9 @@ __init int event_trace_init(void)
if (ret)
pr_warn("Failed to register trace events module notifier\n");
#endif
+
+ eventdir_initialized = true;
+
return 0;
}
@@ -3474,6 +4117,7 @@ void __init trace_event_init(void)
event_trace_memsetup();
init_ftrace_syscalls();
event_trace_enable();
+ event_trace_init_fields();
}
#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST
@@ -3651,17 +4295,16 @@ static struct trace_event_file event_trace_file __initdata;
static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *regs)
{
struct trace_buffer *buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3669,11 +4312,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
if (disabled != 1)
goto out;
- local_save_flags(flags);
-
event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3681,7 +4322,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->parent_ip = parent_ip;
event_trigger_unlock_commit(&event_trace_file, buffer, event,
- entry, flags, pc);
+ entry, trace_ctx);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
preempt_enable_notrace();
@@ -3690,7 +4331,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_ops __initdata =
{
.func = function_test_events_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index bf44f6bbd0c3..0c611b281a5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -5,6 +5,7 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -42,6 +43,52 @@ enum filter_op_ids { OPS };
static const char * ops[] = { OPS };
+enum filter_pred_fn {
+ FILTER_PRED_FN_NOP,
+ FILTER_PRED_FN_64,
+ FILTER_PRED_FN_64_CPUMASK,
+ FILTER_PRED_FN_S64,
+ FILTER_PRED_FN_U64,
+ FILTER_PRED_FN_32,
+ FILTER_PRED_FN_32_CPUMASK,
+ FILTER_PRED_FN_S32,
+ FILTER_PRED_FN_U32,
+ FILTER_PRED_FN_16,
+ FILTER_PRED_FN_16_CPUMASK,
+ FILTER_PRED_FN_S16,
+ FILTER_PRED_FN_U16,
+ FILTER_PRED_FN_8,
+ FILTER_PRED_FN_8_CPUMASK,
+ FILTER_PRED_FN_S8,
+ FILTER_PRED_FN_U8,
+ FILTER_PRED_FN_COMM,
+ FILTER_PRED_FN_STRING,
+ FILTER_PRED_FN_STRLOC,
+ FILTER_PRED_FN_STRRELLOC,
+ FILTER_PRED_FN_PCHAR_USER,
+ FILTER_PRED_FN_PCHAR,
+ FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_CPU_CPUMASK,
+ FILTER_PRED_FN_CPUMASK,
+ FILTER_PRED_FN_CPUMASK_CPU,
+ FILTER_PRED_FN_FUNCTION,
+ FILTER_PRED_FN_,
+ FILTER_PRED_TEST_VISITED,
+};
+
+struct filter_pred {
+ struct regex *regex;
+ struct cpumask *mask;
+ unsigned short *ops;
+ struct ftrace_event_field *field;
+ u64 val;
+ u64 val2;
+ enum filter_pred_fn fn_num;
+ int offset;
+ int not;
+ int op;
+};
+
/*
* pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
* pred_funcs_##type below must match the order of them above.
@@ -55,6 +102,8 @@ static const char * ops[] = { OPS };
C(TOO_MANY_OPEN, "Too many '('"), \
C(TOO_MANY_CLOSE, "Too few '('"), \
C(MISSING_QUOTE, "Missing matching quote"), \
+ C(MISSING_BRACE_OPEN, "Missing '{'"), \
+ C(MISSING_BRACE_CLOSE, "Missing '}'"), \
C(OPERAND_TOO_LONG, "Operand too long"), \
C(EXPECT_STRING, "Expecting string field"), \
C(EXPECT_DIGIT, "Expecting numeric field"), \
@@ -64,8 +113,10 @@ static const char * ops[] = { OPS };
C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
C(INVALID_FILTER, "Meaningless filter expression"), \
+ C(INVALID_CPULIST, "Invalid cpulist"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
+ C(NO_FUNCTION, "Function not found"), \
C(ERRNO, "Error"), \
C(NO_FILTER, "No filter found")
@@ -91,7 +142,7 @@ static bool is_not(const char *str)
}
/**
- * prog_entry - a singe entry in the filter program
+ * struct prog_entry - a singe entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
@@ -103,16 +154,16 @@ struct prog_entry {
};
/**
- * update_preds- assign a program entry a label target
+ * update_preds - assign a program entry a label target
* @prog: The program array
* @N: The index of the current entry in @prog
- * @when_to_branch: What to assign a program entry for its branch condition
+ * @invert: What to assign a program entry for its branch condition
*
* The program entry at @N has a target that points to the index of a program
* entry that can have its target and when_to_branch fields updated.
* Update the current program entry denoted by index @N target field to be
* that of the updated entry. This will denote the entry to update if
- * we are processing an "||" after an "&&"
+ * we are processing an "||" after an "&&".
*/
static void update_preds(struct prog_entry *prog, int N, int invert)
{
@@ -146,6 +197,15 @@ enum {
PROCESS_OR = 4,
};
+static void free_predicate(struct filter_pred *pred)
+{
+ if (pred) {
+ kfree(pred->regex);
+ kfree(pred->mask);
+ kfree(pred);
+ }
+}
+
/*
* Without going into a formal proof, this explains the method that is used in
* parsing the logical expressions.
@@ -256,7 +316,7 @@ enum {
* is "&&" we don't call update_preds(). Instead continue to "c". As the
* next token after "c" is not "&&" but the end of input, we first process the
* "&&" by calling update_preds() for the "&&" then we process the "||" by
- * callin updates_preds() with the values for processing "||".
+ * calling updates_preds() with the values for processing "||".
*
* What does that mean? What update_preds() does is to first save the "target"
* of the program entry indexed by the current program entry's "target"
@@ -296,7 +356,7 @@ enum {
* and "FALSE" the program entry after that, we are now done with the first
* pass.
*
- * Making the above "a || b && c" have a progam of:
+ * Making the above "a || b && c" have a program of:
* prog[0] = { "a", 1, 2 }
* prog[1] = { "b", 0, 2 }
* prog[2] = { "c", 0, 3 }
@@ -390,7 +450,7 @@ enum {
* F: return FALSE
*
* As "r = a; if (!r) goto n5;" is obviously the same as
- * "if (!a) goto n5;" without doing anything we can interperate the
+ * "if (!a) goto n5;" without doing anything we can interpret the
* program as:
* n1: if (!a) goto n5;
* n2: if (!b) goto n5;
@@ -499,7 +559,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
ptr++;
break;
}
- /* fall through */
+ fallthrough;
default:
parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
next - str);
@@ -583,50 +643,118 @@ out_free:
kfree(inverts);
if (prog_stack) {
for (i = 0; prog_stack[i].pred; i++)
- kfree(prog_stack[i].pred);
+ free_predicate(prog_stack[i].pred);
kfree(prog_stack);
}
return ERR_PTR(ret);
}
+static inline int
+do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_equal(mask, cmp);
+ case OP_NE:
+ return !cpumask_equal(mask, cmp);
+ case OP_BAND:
+ return cpumask_intersects(mask, cmp);
+ default:
+ return 0;
+ }
+}
+
+/* Optimisation of do_filter_cpumask() for scalar fields */
+static inline int
+do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask)
+{
+ /*
+ * Per the weight-of-one cpumask optimisations, the mask passed in this
+ * function has a weight >= 2, so it is never equal to a single scalar.
+ */
+ switch (op) {
+ case OP_EQ:
+ return false;
+ case OP_NE:
+ return true;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
+static inline int
+do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_test_cpu(cpu, mask) &&
+ cpumask_nth(1, mask) >= nr_cpu_ids;
+ case OP_NE:
+ return !cpumask_test_cpu(cpu, mask) ||
+ cpumask_nth(1, mask) < nr_cpu_ids;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
+enum pred_cmp_types {
+ PRED_CMP_TYPE_NOP,
+ PRED_CMP_TYPE_LT,
+ PRED_CMP_TYPE_LE,
+ PRED_CMP_TYPE_GT,
+ PRED_CMP_TYPE_GE,
+ PRED_CMP_TYPE_BAND,
+};
+
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr < val; \
-} \
-static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr <= val; \
-} \
-static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr > val; \
-} \
-static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr >= val; \
-} \
-static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return !!(*addr & val); \
-} \
-static const filter_pred_fn_t pred_funcs_##type[] = { \
- filter_pred_LE_##type, \
- filter_pred_LT_##type, \
- filter_pred_GE_##type, \
- filter_pred_GT_##type, \
- filter_pred_BAND_##type, \
-};
+ switch (pred->op) { \
+ case OP_LT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr < val; \
+ } \
+ case OP_LE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr <= val; \
+ } \
+ case OP_GT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr > val; \
+ } \
+ case OP_GE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr >= val; \
+ } \
+ case OP_BAND: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return !!(*addr & val); \
+ } \
+ default: \
+ return 0; \
+ } \
+}
+
+#define DEFINE_CPUMASK_COMPARISON_PRED(size) \
+static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ unsigned int cpu = *addr; \
+ \
+ if (cpu >= nr_cpu_ids) \
+ return 0; \
+ \
+ return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \
+}
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
@@ -649,37 +777,112 @@ DEFINE_COMPARISON_PRED(u16);
DEFINE_COMPARISON_PRED(s8);
DEFINE_COMPARISON_PRED(u8);
+DEFINE_CPUMASK_COMPARISON_PRED(64);
+DEFINE_CPUMASK_COMPARISON_PRED(32);
+DEFINE_CPUMASK_COMPARISON_PRED(16);
+DEFINE_CPUMASK_COMPARISON_PRED(8);
+
DEFINE_EQUALITY_PRED(64);
DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);
+/* user space strings temp buffer */
+#define USTRING_BUF_SIZE 1024
+
+struct ustring_buffer {
+ char buffer[USTRING_BUF_SIZE];
+};
+
+static __percpu struct ustring_buffer *ustring_per_cpu;
+
+static __always_inline char *test_string(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /* For safety, do not trust the string pointer */
+ if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ return NULL;
+ return kstr;
+}
+
+static __always_inline char *test_ustring(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char __user *ustr;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /* user space address? */
+ ustr = (char __user *)str;
+ if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ return NULL;
+
+ return kstr;
+}
+
/* Filter predicate for fixed sized arrays of characters */
static int filter_pred_string(struct filter_pred *pred, void *event)
{
char *addr = (char *)(event + pred->offset);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
+ cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len);
match = cmp ^ pred->not;
return match;
}
-/* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event)
+static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
{
- char **addr = (char **)(event + pred->offset);
int cmp, match;
- int len = strlen(*addr) + 1; /* including tailing '\0' */
+ int len;
- cmp = pred->regex.match(*addr, &pred->regex, len);
+ len = strlen(str) + 1; /* including tailing '\0' */
+ cmp = pred->regex->match(str, pred->regex, len);
match = cmp ^ pred->not;
return match;
}
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_string(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
+
+/* Filter predicate for char * pointers in user space*/
+static int filter_pred_pchar_user(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_ustring(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
/*
* Filter predicate for dynamic sized arrays of characters.
@@ -699,7 +902,30 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, str_len);
+ cmp = pred->regex->match(addr, pred->regex, str_len);
+
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
+/*
+ * Filter predicate for relative dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry as same as dynamic string.
+ * The difference is that the relative one records the location offset
+ * from the field itself, not the event entry.
+ */
+static int filter_pred_strrelloc(struct filter_pred *pred, void *event)
+{
+ u32 *item = (u32 *)(event + pred->offset);
+ u32 str_item = *item;
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
+ char *addr = (char *)(&item[1]) + str_loc;
+ int cmp, match;
+
+ cmp = pred->regex->match(addr, pred->regex, str_len);
match = cmp ^ pred->not;
@@ -732,19 +958,55 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event)
}
}
+/* Filter predicate for current CPU vs user-provided cpumask */
+static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event)
+{
+ int cpu = raw_smp_processor_id();
+
+ return do_filter_scalar_cpumask(pred->op, cpu, pred->mask);
+}
+
+/* Filter predicate for cpumask field vs user-provided cpumask */
+static int filter_pred_cpumask(struct filter_pred *pred, void *event)
+{
+ u32 item = *(u32 *)(event + pred->offset);
+ int loc = item & 0xffff;
+ const struct cpumask *mask = (event + loc);
+ const struct cpumask *cmp = pred->mask;
+
+ return do_filter_cpumask(pred->op, mask, cmp);
+}
+
+/* Filter predicate for cpumask field vs user-provided scalar */
+static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event)
+{
+ u32 item = *(u32 *)(event + pred->offset);
+ int loc = item & 0xffff;
+ const struct cpumask *mask = (event + loc);
+ unsigned int cpu = pred->val;
+
+ return do_filter_cpumask_scalar(pred->op, mask, cpu);
+}
+
/* Filter predicate for COMM. */
static int filter_pred_comm(struct filter_pred *pred, void *event)
{
int cmp;
- cmp = pred->regex.match(current->comm, &pred->regex,
+ cmp = pred->regex->match(current->comm, pred->regex,
TASK_COMM_LEN);
return cmp ^ pred->not;
}
-static int filter_pred_none(struct filter_pred *pred, void *event)
+/* Filter predicate for functions. */
+static int filter_pred_function(struct filter_pred *pred, void *event)
{
- return 0;
+ unsigned long *addr = (unsigned long *)(event + pred->offset);
+ unsigned long start = (unsigned long)pred->val;
+ unsigned long end = (unsigned long)pred->val2;
+ int ret = *addr >= start && *addr < end;
+
+ return pred->op == OP_EQ ? ret : !ret;
}
/*
@@ -756,7 +1018,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
*
* Note:
* - @str might not be NULL-terminated if it's of type DYN_STRING
- * or STATIC_STRING, unless @len is zero.
+ * RDYN_STRING, or STATIC_STRING, unless @len is zero.
*/
static int regex_match_full(char *str, struct regex *r, int len)
@@ -861,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
static void filter_build_regex(struct filter_pred *pred)
{
- struct regex *r = &pred->regex;
+ struct regex *r = pred->regex;
char *search;
enum regex_type type = MATCH_FULL;
@@ -892,6 +1154,19 @@ static void filter_build_regex(struct filter_pred *pred)
}
}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int test_pred_visited_fn(struct filter_pred *pred, void *event);
+#else
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+ return 0;
+}
+#endif
+
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event);
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
@@ -909,7 +1184,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
for (i = 0; prog[i].pred; i++) {
struct filter_pred *pred = prog[i].pred;
- int match = pred->fn(pred, rec);
+ int match = filter_pred_fn_call(pred, rec);
if (match == prog[i].when_to_branch)
i = prog[i].target;
}
@@ -1013,7 +1288,7 @@ static void free_prog(struct event_filter *filter)
return;
for (i = 0; prog[i].pred; i++)
- kfree(prog[i].pred);
+ free_predicate(prog[i].pred);
kfree(prog);
}
@@ -1080,8 +1355,15 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
int filter_assign_type(const char *type)
{
- if (strstr(type, "__data_loc") && strstr(type, "char"))
- return FILTER_DYN_STRING;
+ if (strstr(type, "__data_loc")) {
+ if (strstr(type, "char"))
+ return FILTER_DYN_STRING;
+ if (strstr(type, "cpumask_t"))
+ return FILTER_CPUMASK;
+ }
+
+ if (strstr(type, "__rel_loc") && strstr(type, "char"))
+ return FILTER_RDYN_STRING;
if (strchr(type, '[') && strstr(type, "char"))
return FILTER_STATIC_STRING;
@@ -1092,10 +1374,10 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
- int field_size, int field_is_signed)
+static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
+ int field_size, int field_is_signed)
{
- filter_pred_fn_t fn = NULL;
+ enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
int pred_func_index = -1;
switch (op) {
@@ -1104,50 +1386,115 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
break;
default:
if (WARN_ON_ONCE(op < PRED_FUNC_START))
- return NULL;
+ return fn;
pred_func_index = op - PRED_FUNC_START;
if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
- return NULL;
+ return fn;
}
switch (field_size) {
case 8:
if (pred_func_index < 0)
- fn = filter_pred_64;
+ fn = FILTER_PRED_FN_64;
else if (field_is_signed)
- fn = pred_funcs_s64[pred_func_index];
+ fn = FILTER_PRED_FN_S64;
else
- fn = pred_funcs_u64[pred_func_index];
+ fn = FILTER_PRED_FN_U64;
break;
case 4:
if (pred_func_index < 0)
- fn = filter_pred_32;
+ fn = FILTER_PRED_FN_32;
else if (field_is_signed)
- fn = pred_funcs_s32[pred_func_index];
+ fn = FILTER_PRED_FN_S32;
else
- fn = pred_funcs_u32[pred_func_index];
+ fn = FILTER_PRED_FN_U32;
break;
case 2:
if (pred_func_index < 0)
- fn = filter_pred_16;
+ fn = FILTER_PRED_FN_16;
else if (field_is_signed)
- fn = pred_funcs_s16[pred_func_index];
+ fn = FILTER_PRED_FN_S16;
else
- fn = pred_funcs_u16[pred_func_index];
+ fn = FILTER_PRED_FN_U16;
break;
case 1:
if (pred_func_index < 0)
- fn = filter_pred_8;
+ fn = FILTER_PRED_FN_8;
else if (field_is_signed)
- fn = pred_funcs_s8[pred_func_index];
+ fn = FILTER_PRED_FN_S8;
else
- fn = pred_funcs_u8[pred_func_index];
+ fn = FILTER_PRED_FN_U8;
break;
}
return fn;
}
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event)
+{
+ switch (pred->fn_num) {
+ case FILTER_PRED_FN_64:
+ return filter_pred_64(pred, event);
+ case FILTER_PRED_FN_64_CPUMASK:
+ return filter_pred_64_cpumask(pred, event);
+ case FILTER_PRED_FN_S64:
+ return filter_pred_s64(pred, event);
+ case FILTER_PRED_FN_U64:
+ return filter_pred_u64(pred, event);
+ case FILTER_PRED_FN_32:
+ return filter_pred_32(pred, event);
+ case FILTER_PRED_FN_32_CPUMASK:
+ return filter_pred_32_cpumask(pred, event);
+ case FILTER_PRED_FN_S32:
+ return filter_pred_s32(pred, event);
+ case FILTER_PRED_FN_U32:
+ return filter_pred_u32(pred, event);
+ case FILTER_PRED_FN_16:
+ return filter_pred_16(pred, event);
+ case FILTER_PRED_FN_16_CPUMASK:
+ return filter_pred_16_cpumask(pred, event);
+ case FILTER_PRED_FN_S16:
+ return filter_pred_s16(pred, event);
+ case FILTER_PRED_FN_U16:
+ return filter_pred_u16(pred, event);
+ case FILTER_PRED_FN_8:
+ return filter_pred_8(pred, event);
+ case FILTER_PRED_FN_8_CPUMASK:
+ return filter_pred_8_cpumask(pred, event);
+ case FILTER_PRED_FN_S8:
+ return filter_pred_s8(pred, event);
+ case FILTER_PRED_FN_U8:
+ return filter_pred_u8(pred, event);
+ case FILTER_PRED_FN_COMM:
+ return filter_pred_comm(pred, event);
+ case FILTER_PRED_FN_STRING:
+ return filter_pred_string(pred, event);
+ case FILTER_PRED_FN_STRLOC:
+ return filter_pred_strloc(pred, event);
+ case FILTER_PRED_FN_STRRELLOC:
+ return filter_pred_strrelloc(pred, event);
+ case FILTER_PRED_FN_PCHAR_USER:
+ return filter_pred_pchar_user(pred, event);
+ case FILTER_PRED_FN_PCHAR:
+ return filter_pred_pchar(pred, event);
+ case FILTER_PRED_FN_CPU:
+ return filter_pred_cpu(pred, event);
+ case FILTER_PRED_FN_CPU_CPUMASK:
+ return filter_pred_cpu_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK:
+ return filter_pred_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK_CPU:
+ return filter_pred_cpumask_cpu(pred, event);
+ case FILTER_PRED_FN_FUNCTION:
+ return filter_pred_function(pred, event);
+ case FILTER_PRED_TEST_VISITED:
+ return test_pred_visited_fn(pred, event);
+ default:
+ return 0;
+ }
+}
+
/* Called when a predicate is encountered by predicate_parse() */
static int parse_pred(const char *str, void *data,
int pos, struct filter_parse_error *pe,
@@ -1156,8 +1503,14 @@ static int parse_pred(const char *str, void *data,
struct trace_event_call *call = data;
struct ftrace_event_field *field;
struct filter_pred *pred = NULL;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long ip;
char num_buf[24]; /* Big enough to hold an address */
char *field_name;
+ char *name;
+ bool function = false;
+ bool ustring = false;
char q;
u64 val;
int len;
@@ -1192,6 +1545,18 @@ static int parse_pred(const char *str, void *data,
return -EINVAL;
}
+ /* See if the field is a user space string */
+ if ((len = str_has_prefix(str + i, ".ustring"))) {
+ ustring = true;
+ i += len;
+ }
+
+ /* See if the field is a kernel function name */
+ if ((len = str_has_prefix(str + i, ".function"))) {
+ function = true;
+ i += len;
+ }
+
while (isspace(str[i]))
i++;
@@ -1222,7 +1587,71 @@ static int parse_pred(const char *str, void *data,
pred->offset = field->offset;
pred->op = op;
- if (ftrace_event_is_function(call)) {
+ if (function) {
+ /* The field must be the same size as long */
+ if (field->size != sizeof(long)) {
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Function only works with '==' or '!=' and an unquoted string */
+ switch (op) {
+ case OP_NE:
+ case OP_EQ:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+ goto err_free;
+ }
+
+ if (isdigit(str[i])) {
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
+
+ len = i - s;
+ /* 0xfeedfacedeadbeef is 18 chars max */
+ if (len >= sizeof(num_buf)) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
+
+ strncpy(num_buf, str + s, len);
+ num_buf[len] = 0;
+
+ ret = kstrtoul(num_buf, 0, &ip);
+ if (ret) {
+ parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+ goto err_free;
+ }
+ } else {
+ s = i;
+ for (; str[i] && !isspace(str[i]); i++)
+ ;
+
+ len = i - s;
+ name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!name)
+ goto err_mem;
+ ip = kallsyms_lookup_name(name);
+ kfree(name);
+ if (!ip) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+ }
+
+ /* Now find the function start and end address */
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset)) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+
+ pred->fn_num = FILTER_PRED_FN_FUNCTION;
+ pred->val = ip - offset;
+ pred->val2 = pred->val + size;
+
+ } else if (ftrace_event_is_function(call)) {
/*
* Perf does things different with function events.
* It only allows an "ip" field, and expects a string.
@@ -1234,7 +1663,7 @@ static int parse_pred(const char *str, void *data,
parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
goto err_free;
}
- pred->fn = filter_pred_none;
+ pred->fn_num = FILTER_PRED_FN_NOP;
/*
* Quotes are not required, but if they exist then we need
@@ -1261,9 +1690,130 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
+
+ } else if (!strncmp(str + i, "CPUS", 4)) {
+ unsigned int maskstart;
+ bool single;
+ char *tmp;
+
+ switch (field->filter_type) {
+ case FILTER_CPUMASK:
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ switch (op) {
+ case OP_EQ:
+ case OP_NE:
+ case OP_BAND:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Skip CPUS */
+ i += 4;
+ if (str[i++] != '{') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i);
+ goto err_free;
+ }
+ maskstart = i;
+
+ /* Walk the cpulist until closing } */
+ for (; str[i] && str[i] != '}'; i++)
+ ;
+
+ if (str[i] != '}') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i);
+ goto err_free;
+ }
+
+ if (maskstart == i) {
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+
+ /* Copy the cpulist between { and } */
+ tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL);
+ if (!tmp)
+ goto err_mem;
+
+ strscpy(tmp, str + maskstart, (i - maskstart) + 1);
+ pred->mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!pred->mask) {
+ kfree(tmp);
+ goto err_mem;
+ }
+
+ /* Now parse it */
+ if (cpulist_parse(tmp, pred->mask)) {
+ kfree(tmp);
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+ kfree(tmp);
+
+ /* Move along */
+ i++;
+
+ /*
+ * Optimisation: if the user-provided mask has a weight of one
+ * then we can treat it as a scalar input.
+ */
+ single = cpumask_weight(pred->mask) == 1;
+ if (single) {
+ pred->val = cpumask_first(pred->mask);
+ kfree(pred->mask);
+ pred->mask = NULL;
+ }
+
+ if (field->filter_type == FILTER_CPUMASK) {
+ pred->fn_num = single ?
+ FILTER_PRED_FN_CPUMASK_CPU :
+ FILTER_PRED_FN_CPUMASK;
+ } else if (field->filter_type == FILTER_CPU) {
+ if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = FILTER_PRED_FN_CPU;
+ } else {
+ pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK;
+ }
+ } else if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = select_comparison_fn(pred->op, field->size, false);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ } else {
+ switch (field->size) {
+ case 8:
+ pred->fn_num = FILTER_PRED_FN_64_CPUMASK;
+ break;
+ case 4:
+ pred->fn_num = FILTER_PRED_FN_32_CPUMASK;
+ break;
+ case 2:
+ pred->fn_num = FILTER_PRED_FN_16_CPUMASK;
+ break;
+ case 1:
+ pred->fn_num = FILTER_PRED_FN_8_CPUMASK;
+ break;
+ }
+ }
/* This is either a string, or an integer */
} else if (str[i] == '\'' || str[i] == '"') {
@@ -1273,7 +1823,7 @@ static int parse_pred(const char *str, void *data,
switch (op) {
case OP_NE:
pred->not = 1;
- /* Fall through */
+ fallthrough;
case OP_GLOB:
case OP_EQ:
break;
@@ -1305,23 +1855,40 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
filter_build_regex(pred);
if (field->filter_type == FILTER_COMM) {
- pred->fn = filter_pred_comm;
+ pred->fn_num = FILTER_PRED_FN_COMM;
} else if (field->filter_type == FILTER_STATIC_STRING) {
- pred->fn = filter_pred_string;
- pred->regex.field_len = field->size;
+ pred->fn_num = FILTER_PRED_FN_STRING;
+ pred->regex->field_len = field->size;
- } else if (field->filter_type == FILTER_DYN_STRING)
- pred->fn = filter_pred_strloc;
- else
- pred->fn = filter_pred_pchar;
+ } else if (field->filter_type == FILTER_DYN_STRING) {
+ pred->fn_num = FILTER_PRED_FN_STRLOC;
+ } else if (field->filter_type == FILTER_RDYN_STRING)
+ pred->fn_num = FILTER_PRED_FN_STRRELLOC;
+ else {
+
+ if (!ustring_per_cpu) {
+ /* Once allocated, keep it around for good */
+ ustring_per_cpu = alloc_percpu(struct ustring_buffer);
+ if (!ustring_per_cpu)
+ goto err_mem;
+ }
+
+ if (ustring)
+ pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
+ else
+ pred->fn_num = FILTER_PRED_FN_PCHAR;
+ }
/* go past the last quote */
i++;
@@ -1368,10 +1935,10 @@ static int parse_pred(const char *str, void *data,
pred->val = val;
if (field->filter_type == FILTER_CPU)
- pred->fn = filter_pred_cpu;
+ pred->fn_num = FILTER_PRED_FN_CPU;
else {
- pred->fn = select_comparison_fn(pred->op, field->size,
- field->is_signed);
+ pred->fn_num = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
if (pred->op == OP_NE)
pred->not = 1;
}
@@ -1385,8 +1952,11 @@ static int parse_pred(const char *str, void *data,
return i;
err_free:
- kfree(pred);
+ free_predicate(pred);
return -EINVAL;
+err_mem:
+ free_predicate(pred);
+ return -ENOMEM;
}
enum {
@@ -1561,27 +2131,6 @@ static inline void event_clear_filter(struct trace_event_file *file)
RCU_INIT_POINTER(file->filter, NULL);
}
-static inline void
-event_set_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline void
-event_clear_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline bool
-event_no_set_filter_flag(struct trace_event_file *file)
-{
- if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
- return true;
-
- return false;
-}
-
struct filter_list {
struct list_head list;
struct event_filter *filter;
@@ -1714,8 +2263,9 @@ static void create_filter_finish(struct filter_parse_error *pe)
/**
* create_filter - create a filter for a trace_event_call
+ * @tr: the trace array associated with these events
* @call: trace_event_call to create a filter for
- * @filter_str: filter string
+ * @filter_string: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
* Must be a pointer that references a NULL pointer.
@@ -1762,8 +2312,8 @@ int create_event_filter(struct trace_array *tr,
}
/**
- * create_system_filter - create a filter for an event_subsystem
- * @system: event_subsystem to create a filter for
+ * create_system_filter - create a filter for an event subsystem
+ * @dir: the descriptor for the subsystem directory
* @filter_str: filter string
* @filterp: out param for created filter (always updated on return)
*
@@ -1771,7 +2321,6 @@ int create_event_filter(struct trace_array *tr,
* and always remembers @filter_str.
*/
static int create_system_filter(struct trace_subsystem_dir *dir,
- struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
struct filter_parse_error *pe = NULL;
@@ -1779,13 +2328,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
err = create_filter_start(filter_str, true, &pe, filterp);
if (!err) {
- err = process_system_preds(dir, tr, pe, filter_str);
+ err = process_system_preds(dir, dir->tr, pe, filter_str);
if (!err) {
/* System filters just show a default message */
kfree((*filterp)->filter_string);
(*filterp)->filter_string = NULL;
} else {
- append_filter_err(tr, pe, *filterp);
+ append_filter_err(dir->tr, pe, *filterp);
}
}
create_filter_finish(pe);
@@ -1800,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
struct event_filter *filter = NULL;
int err;
+ if (file->flags & EVENT_FILE_FL_FREED)
+ return -ENODEV;
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(file);
filter = event_filter(file);
@@ -1873,7 +2425,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
goto out_unlock;
}
- err = create_system_filter(dir, tr, filter_string, &filter);
+ err = create_system_filter(dir, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
@@ -1950,7 +2502,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
/*
* The 'ip' field could have multiple filters set, separated
* either by space or comma. We first cut the filter and apply
- * all pieces separatelly.
+ * all pieces separately.
*/
re = ftrace_function_filter_re(buf, len, &re_cnt);
if (!re)
@@ -1999,8 +2551,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred,
return ret;
return __ftrace_function_set_filter(pred->op == OP_EQ,
- pred->regex.pattern,
- pred->regex.len,
+ pred->regex->pattern,
+ pred->regex->len,
data);
}
@@ -2196,7 +2748,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
- WARN_ON_ONCE(!pred->fn);
+ WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
if (!field) {
WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2206,7 +2758,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
if (!strchr(fields, *field->name))
continue;
- pred->fn = test_pred_visited_fn;
+ pred->fn_num = FILTER_PRED_TEST_VISITED;
}
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0b933546142e..6ece1308d36a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -65,7 +65,12 @@
C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
- C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
+ C(EXPECT_NUMBER, "Expecting numeric literal"), \
+ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
+ C(DIVISION_BY_ZERO, "Division by zero"), \
+ C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -81,18 +86,56 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event);
#define HIST_FIELD_OPERANDS_MAX 2
#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
#define HIST_ACTIONS_MAX 8
+#define HIST_CONST_DIGITS_MAX 21
+#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */
enum field_op_id {
FIELD_OP_NONE,
FIELD_OP_PLUS,
FIELD_OP_MINUS,
FIELD_OP_UNARY_MINUS,
+ FIELD_OP_DIV,
+ FIELD_OP_MULT,
+};
+
+enum hist_field_fn {
+ HIST_FIELD_FN_NOP,
+ HIST_FIELD_FN_VAR_REF,
+ HIST_FIELD_FN_COUNTER,
+ HIST_FIELD_FN_CONST,
+ HIST_FIELD_FN_LOG2,
+ HIST_FIELD_FN_BUCKET,
+ HIST_FIELD_FN_TIMESTAMP,
+ HIST_FIELD_FN_CPU,
+ HIST_FIELD_FN_STRING,
+ HIST_FIELD_FN_DYNSTRING,
+ HIST_FIELD_FN_RELDYNSTRING,
+ HIST_FIELD_FN_PSTRING,
+ HIST_FIELD_FN_S64,
+ HIST_FIELD_FN_U64,
+ HIST_FIELD_FN_S32,
+ HIST_FIELD_FN_U32,
+ HIST_FIELD_FN_S16,
+ HIST_FIELD_FN_U16,
+ HIST_FIELD_FN_S8,
+ HIST_FIELD_FN_U8,
+ HIST_FIELD_FN_UMINUS,
+ HIST_FIELD_FN_MINUS,
+ HIST_FIELD_FN_PLUS,
+ HIST_FIELD_FN_DIV,
+ HIST_FIELD_FN_MULT,
+ HIST_FIELD_FN_DIV_POWER2,
+ HIST_FIELD_FN_DIV_NOT_POWER2,
+ HIST_FIELD_FN_DIV_MULT_SHIFT,
+ HIST_FIELD_FN_EXECNAME,
+ HIST_FIELD_FN_STACK,
};
/*
@@ -114,14 +157,15 @@ struct hist_var {
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
- hist_field_fn_t fn;
+ unsigned long buckets;
+ const char *type;
+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
+ struct hist_trigger_data *hist_data;
+ enum hist_field_fn fn_num;
unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
- const char *type;
- struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
- struct hist_trigger_data *hist_data;
/*
* Variable fields contain variable-specific info in var.
@@ -147,18 +191,33 @@ struct hist_field {
*/
unsigned int var_ref_idx;
bool read_once;
+
+ unsigned int var_str_idx;
+
+ /* Numeric literals are represented as u64 */
+ u64 constant;
+ /* Used to optimize division by constants */
+ u64 div_multiplier;
};
-static u64 hist_field_none(struct hist_field *field,
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event);
+
+static u64 hist_field_const(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
- return 0;
+ return field->constant;
}
static u64 hist_field_counter(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -167,6 +226,7 @@ static u64 hist_field_counter(struct hist_field *field,
static u64 hist_field_string(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -177,6 +237,7 @@ static u64 hist_field_string(struct hist_field *hist_field,
static u64 hist_field_dynstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -187,8 +248,23 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
return (u64)(unsigned long)addr;
}
+static u64 hist_field_reldynstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ u32 *item = event + hist_field->field->offset;
+ u32 str_item = *item;
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)&item[1] + str_loc;
+
+ return (u64)(unsigned long)addr;
+}
+
static u64 hist_field_pstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -199,52 +275,177 @@ static u64 hist_field_pstring(struct hist_field *hist_field,
static u64 hist_field_log2(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, elt, rbe, event);
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_bucket(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+ unsigned long buckets = hist_field->buckets;
+
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
+
+ if (WARN_ON_ONCE(!buckets))
+ return val;
+
+ if (val >= LONG_MAX)
+ val = div64_ul(val, buckets);
+ else
+ val = (u64)((unsigned long)val / buckets);
+ return val * buckets;
+}
+
static u64 hist_field_plus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
static u64 hist_field_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
+static u64 hist_field_div(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
+
+ /* Return -1 for the undefined case */
+ if (!val2)
+ return -1;
+
+ /* Use shift if the divisor is a power of 2 */
+ if (!(val2 & (val2 - 1)))
+ return val1 >> __ffs64(val2);
+
+ return div64_u64(val1, val2);
+}
+
+static u64 div_by_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+
+ return val1 >> __ffs64(operand2->constant);
+}
+
+static u64 div_by_not_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 div_by_mult_and_shift(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+
+ /*
+ * If the divisor is a constant, do a multiplication and shift instead.
+ *
+ * Choose Z = some power of 2. If Y <= Z, then:
+ * X / Y = (X * (Z / Y)) / Z
+ *
+ * (Z / Y) is a constant (mult) which is calculated at parse time, so:
+ * X / Y = (X * mult) / Z
+ *
+ * The division by Z can be replaced by a shift since Z is a power of 2:
+ * X / Y = (X * mult) >> HIST_DIV_SHIFT
+ *
+ * As long, as X < Z the results will not be off by more than 1.
+ */
+ if (val1 < (1 << HIST_DIV_SHIFT)) {
+ u64 mult = operand2->div_multiplier;
+
+ return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT;
+ }
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 hist_field_mult(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
+
+ return val1 * val2;
+}
+
static u64 hist_field_unary_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+ s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -253,6 +454,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
#define DEFINE_HIST_FIELD_FN(type) \
static u64 hist_field_##type(struct hist_field *hist_field, \
struct tracing_map_elt *elt, \
+ struct trace_buffer *buffer, \
struct ring_buffer_event *rbe, \
void *event) \
{ \
@@ -279,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8);
#define for_each_hist_key_field(i, hist_data) \
for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
-#define HIST_STACKTRACE_DEPTH 16
-#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
-#define HIST_STACKTRACE_SKIP 5
-
#define HITCOUNT_IDX 0
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
@@ -304,6 +502,10 @@ enum hist_field_flags {
HIST_FIELD_FL_VAR_REF = 1 << 14,
HIST_FIELD_FL_CPU = 1 << 15,
HIST_FIELD_FL_ALIAS = 1 << 16,
+ HIST_FIELD_FL_BUCKET = 1 << 17,
+ HIST_FIELD_FL_CONST = 1 << 18,
+ HIST_FIELD_FL_PERCENT = 1 << 19,
+ HIST_FIELD_FL_GRAPH = 1 << 20,
};
struct var_defs {
@@ -322,6 +524,7 @@ struct hist_trigger_attrs {
bool cont;
bool clear;
bool ts_in_usecs;
+ bool no_hitcount;
unsigned int map_bits;
char *assignment_str[TRACING_MAP_VARS_MAX];
@@ -349,6 +552,7 @@ struct hist_trigger_data {
unsigned int n_keys;
unsigned int n_fields;
unsigned int n_vars;
+ unsigned int n_var_str;
unsigned int key_size;
struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
unsigned int n_sort_keys;
@@ -377,7 +581,8 @@ struct hist_trigger_data {
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals);
@@ -413,7 +618,7 @@ struct action_data {
* event param, and is passed to the synthetic event
* invocation.
*/
- unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
+ unsigned int var_ref_idx[SYNTH_FIELDS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -469,7 +674,8 @@ struct track_data {
struct hist_elt_data {
char *comm;
u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
+ char **field_var_str;
+ int n_field_var_str;
};
struct snapshot_context {
@@ -477,6 +683,25 @@ struct snapshot_context {
void *key;
};
+/*
+ * Returns the specific division function to use if the divisor
+ * is constant. This avoids extra branches when the trigger is hit.
+ */
+static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
+{
+ u64 div = divisor->constant;
+
+ if (!(div & (div - 1)))
+ return HIST_FIELD_FN_DIV_POWER2;
+
+ /* If the divisor is too large, do a regular division */
+ if (div > (1 << HIST_DIV_SHIFT))
+ return HIST_FIELD_FN_DIV_NOT_POWER2;
+
+ divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
+ return HIST_FIELD_FN_DIV_MULT_SHIFT;
+}
+
static void track_data_free(struct track_data *track_data)
{
struct hist_elt_data *elt_data;
@@ -532,11 +757,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -548,8 +778,11 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ kfree(last_cmd);
+
+ last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str);
+ if (!last_cmd)
+ return;
if (file) {
call = file->event_call;
@@ -562,18 +795,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -605,7 +842,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
}
static void action_trace(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -621,13 +859,14 @@ struct hist_var_data {
static u64 hist_field_timestamp(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_trigger_data *hist_data = hist_field->hist_data;
struct trace_array *tr = hist_data->event_file->tr;
- u64 ts = ring_buffer_event_time_stamp(rbe);
+ u64 ts = ring_buffer_event_time_stamp(buffer, rbe);
if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
ts = ns2usecs(ts);
@@ -637,6 +876,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
static u64 hist_field_cpu(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -737,7 +977,7 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
* A trigger can define one or more variables. If any one of them is
* currently referenced by any other trigger, this function will
* determine that.
-
+ *
* Typically used to determine whether or not a trigger can be removed
* - if there are any references to a trigger's variables, it cannot.
*
@@ -1017,6 +1257,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
static u64 hist_field_var_ref(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -1083,16 +1324,20 @@ static const char *hist_field_name(struct hist_field *field,
{
const char *field_name = "";
+ if (WARN_ON_ONCE(!field))
+ return field_name;
+
if (level > 1)
return field_name;
if (field->field)
field_name = field->field->name;
else if (field->flags & HIST_FIELD_FL_LOG2 ||
- field->flags & HIST_FIELD_FL_ALIAS)
+ field->flags & HIST_FIELD_FL_ALIAS ||
+ field->flags & HIST_FIELD_FL_BUCKET)
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
- field_name = "cpu";
+ field_name = "common_cpu";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -1108,6 +1353,13 @@ static const char *hist_field_name(struct hist_field *field,
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
+ else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ field_name = field->field->name;
+ else
+ field_name = "common_stacktrace";
+ } else if (field->flags & HIST_FIELD_FL_HITCOUNT)
+ field_name = "hitcount";
if (field_name == NULL)
field_name = "";
@@ -1115,38 +1367,32 @@ static const char *hist_field_name(struct hist_field *field,
return field_name;
}
-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
{
- hist_field_fn_t fn = NULL;
-
switch (field_size) {
case 8:
if (field_is_signed)
- fn = hist_field_s64;
+ return HIST_FIELD_FN_S64;
else
- fn = hist_field_u64;
- break;
+ return HIST_FIELD_FN_U64;
case 4:
if (field_is_signed)
- fn = hist_field_s32;
+ return HIST_FIELD_FN_S32;
else
- fn = hist_field_u32;
- break;
+ return HIST_FIELD_FN_U32;
case 2:
if (field_is_signed)
- fn = hist_field_s16;
+ return HIST_FIELD_FN_S16;
else
- fn = hist_field_u16;
- break;
+ return HIST_FIELD_FN_U16;
case 1:
if (field_is_signed)
- fn = hist_field_s8;
+ return HIST_FIELD_FN_S8;
else
- fn = hist_field_u8;
- break;
+ return HIST_FIELD_FN_U8;
}
- return fn;
+ return HIST_FIELD_FN_NOP;
}
static int parse_map_size(char *str)
@@ -1304,7 +1550,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
- } else if (strcmp(str, "pause") == 0)
+ } else if (strcmp(str, "nohitcount") == 0 ||
+ strcmp(str, "NOHC") == 0)
+ attrs->no_hitcount = true;
+ else if (strcmp(str, "pause") == 0)
attrs->pause = true;
else if ((strcmp(str, "cont") == 0) ||
(strcmp(str, "continue") == 0))
@@ -1357,9 +1606,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
unsigned int i;
- for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ for (i = 0; i < elt_data->n_field_var_str; i++)
kfree(elt_data->field_var_str[i]);
+ kfree(elt_data->field_var_str);
+
kfree(elt_data->comm);
kfree(elt_data);
}
@@ -1376,17 +1627,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_trigger_data *hist_data = elt->map->private_data;
unsigned int size = TASK_COMM_LEN;
struct hist_elt_data *elt_data;
- struct hist_field *key_field;
+ struct hist_field *hist_field;
unsigned int i, n_str;
elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
if (!elt_data)
return -ENOMEM;
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
elt_data->comm = kzalloc(size, GFP_KERNEL);
if (!elt_data->comm) {
kfree(elt_data);
@@ -1396,10 +1647,24 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_save_var_str;
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str +
+ hist_data->n_var_str;
+ if (n_str > SYNTH_FIELDS_MAX) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+
+ BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1));
size = STR_VAR_LEN_MAX;
+ elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
+ if (!elt_data->field_var_str) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+ elt_data->n_field_var_str = n_str;
+
for (i = 0; i < n_str; i++) {
elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
if (!elt_data->field_var_str[i]) {
@@ -1443,8 +1708,16 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "syscall";
else if (hist_field->flags & HIST_FIELD_FL_LOG2)
flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_BUCKET)
+ flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
+ else if (hist_field->flags & HIST_FIELD_FL_PERCENT)
+ flags_str = "percent";
+ else if (hist_field->flags & HIST_FIELD_FL_GRAPH)
+ flags_str = "graph";
+ else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+ flags_str = "stacktrace";
return flags_str;
}
@@ -1453,6 +1726,12 @@ static void expr_field_str(struct hist_field *field, char *expr)
{
if (field->flags & HIST_FIELD_FL_VAR_REF)
strcat(expr, "$");
+ else if (field->flags & HIST_FIELD_FL_CONST) {
+ char str[HIST_CONST_DIGITS_MAX];
+
+ snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
+ strcat(expr, str);
+ }
strcat(expr, hist_field_name(field, 0));
@@ -1508,6 +1787,12 @@ static char *expr_str(struct hist_field *field, unsigned int level)
case FIELD_OP_PLUS:
strcat(expr, "+");
break;
+ case FIELD_OP_DIV:
+ strcat(expr, "/");
+ break;
+ case FIELD_OP_MULT:
+ strcat(expr, "*");
+ break;
default:
kfree(expr);
return NULL;
@@ -1518,27 +1803,92 @@ static char *expr_str(struct hist_field *field, unsigned int level)
return expr;
}
-static int contains_operator(char *str)
+/*
+ * If field_op != FIELD_OP_NONE, *sep points to the root operator
+ * of the expression tree to be evaluated.
+ */
+static int contains_operator(char *str, char **sep)
{
enum field_op_id field_op = FIELD_OP_NONE;
- char *op;
+ char *minus_op, *plus_op, *div_op, *mult_op;
- op = strpbrk(str, "+-");
- if (!op)
- return FIELD_OP_NONE;
- switch (*op) {
- case '-':
- if (*str == '-')
+ /*
+ * Report the last occurrence of the operators first, so that the
+ * expression is evaluated left to right. This is important since
+ * subtraction and division are not associative.
+ *
+ * e.g
+ * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2
+ * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2
+ */
+
+ /*
+ * First, find lower precedence addition and subtraction
+ * since the expression will be evaluated recursively.
+ */
+ minus_op = strrchr(str, '-');
+ if (minus_op) {
+ /*
+ * Unary minus is not supported in sub-expressions. If
+ * present, it is always the next root operator.
+ */
+ if (minus_op == str) {
field_op = FIELD_OP_UNARY_MINUS;
- else
- field_op = FIELD_OP_MINUS;
- break;
- case '+':
- field_op = FIELD_OP_PLUS;
- break;
- default:
- break;
+ goto out;
+ }
+
+ field_op = FIELD_OP_MINUS;
+ }
+
+ plus_op = strrchr(str, '+');
+ if (plus_op || minus_op) {
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (plus_op > minus_op)
+ field_op = FIELD_OP_PLUS;
+ goto out;
+ }
+
+ /*
+ * Multiplication and division have higher precedence than addition and
+ * subtraction.
+ */
+ div_op = strrchr(str, '/');
+ if (div_op)
+ field_op = FIELD_OP_DIV;
+
+ mult_op = strrchr(str, '*');
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (mult_op > div_op)
+ field_op = FIELD_OP_MULT;
+
+out:
+ if (sep) {
+ switch (field_op) {
+ case FIELD_OP_UNARY_MINUS:
+ case FIELD_OP_MINUS:
+ *sep = minus_op;
+ break;
+ case FIELD_OP_PLUS:
+ *sep = plus_op;
+ break;
+ case FIELD_OP_DIV:
+ *sep = div_op;
+ break;
+ case FIELD_OP_MULT:
+ *sep = mult_op;
+ break;
+ case FIELD_OP_NONE:
+ default:
+ *sep = NULL;
+ break;
+ }
}
return field_op;
@@ -1556,7 +1906,9 @@ static void __destroy_hist_field(struct hist_field *hist_field)
kfree(hist_field->var.name);
kfree(hist_field->name);
- kfree(hist_field->type);
+
+ /* Can likely be a const */
+ kfree_const(hist_field->type);
kfree(hist_field->system);
kfree(hist_field->event_name);
@@ -1606,12 +1958,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out; /* caller will populate */
if (flags & HIST_FIELD_FL_VAR_REF) {
- hist_field->fn = hist_field_var_ref;
+ hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
goto out;
}
if (flags & HIST_FIELD_FL_HITCOUNT) {
- hist_field->fn = hist_field_counter;
+ hist_field->fn_num = HIST_FIELD_FN_COUNTER;
+ hist_field->size = sizeof(u64);
+ hist_field->type = "u64";
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_CONST) {
+ hist_field->fn_num = HIST_FIELD_FN_CONST;
hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
@@ -1620,66 +1979,77 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
- hist_field->fn = hist_field_none;
+ if (field)
+ hist_field->fn_num = HIST_FIELD_FN_STACK;
+ else
+ hist_field->fn_num = HIST_FIELD_FN_NOP;
+ hist_field->size = HIST_STACKTRACE_SIZE;
+ hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
- if (flags & HIST_FIELD_FL_LOG2) {
- unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
- hist_field->fn = hist_field_log2;
+ if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
+ unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
+ hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
+ HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+ if (!hist_field->operands[0])
+ goto free;
hist_field->size = hist_field->operands[0]->size;
- hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
goto out;
}
if (flags & HIST_FIELD_FL_TIMESTAMP) {
- hist_field->fn = hist_field_timestamp;
+ hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
hist_field->size = sizeof(u64);
- hist_field->type = kstrdup("u64", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
- hist_field->fn = hist_field_cpu;
+ hist_field->fn_num = HIST_FIELD_FN_CPU;
hist_field->size = sizeof(int);
- hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "unsigned int";
goto out;
}
if (WARN_ON_ONCE(!field))
goto out;
- if (is_string_field(field)) {
+ /* Pointers to strings are just pointers and dangerous to dereference */
+ if (is_string_field(field) &&
+ (field->filter_type != FILTER_PTR_STRING)) {
flags |= HIST_FIELD_FL_STRING;
hist_field->size = MAX_FILTER_STR_VAL;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
- if (field->filter_type == FILTER_STATIC_STRING)
- hist_field->fn = hist_field_string;
- else if (field->filter_type == FILTER_DYN_STRING)
- hist_field->fn = hist_field_dynstring;
+ if (field->filter_type == FILTER_STATIC_STRING) {
+ hist_field->fn_num = HIST_FIELD_FN_STRING;
+ hist_field->size = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING) {
+ hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
+ } else if (field->filter_type == FILTER_RDYN_STRING)
+ hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
- hist_field->fn = hist_field_pstring;
+ hist_field->fn_num = HIST_FIELD_FN_PSTRING;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
- hist_field->fn = select_value_fn(field->size,
- field->is_signed);
- if (!hist_field->fn) {
+ hist_field->fn_num = select_value_fn(field->size,
+ field->is_signed);
+ if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
destroy_hist_field(hist_field, 0);
return NULL;
}
@@ -1759,7 +2129,7 @@ static int init_var_ref(struct hist_field *ref_field,
}
}
- ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL);
if (!ref_field->type) {
err = -ENOMEM;
goto free;
@@ -1768,8 +2138,11 @@ static int init_var_ref(struct hist_field *ref_field,
return err;
free:
kfree(ref_field->system);
+ ref_field->system = NULL;
kfree(ref_field->event_name);
+ ref_field->event_name = NULL;
kfree(ref_field->name);
+ ref_field->name = NULL;
goto out;
}
@@ -1822,7 +2195,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
return ref_field;
}
}
-
+ /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */
+ if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX)
+ return NULL;
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
if (init_var_ref(ref_field, var_field, system, event_name)) {
@@ -1856,7 +2231,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data,
if (strcmp(var_name, name) == 0) {
field = hist_data->attrs->var_defs.expr[i];
- if (contains_operator(field) || is_var_ref(field))
+ if (contains_operator(field, NULL) || is_var_ref(field))
continue;
return field;
}
@@ -1917,7 +2292,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
static struct ftrace_event_field *
parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
- char *field_str, unsigned long *flags)
+ char *field_str, unsigned long *flags, unsigned long *buckets)
{
struct ftrace_event_field *field = NULL;
char *field_name, *modifier, *str;
@@ -1933,18 +2308,47 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_HEX;
else if (strcmp(modifier, "sym") == 0)
*flags |= HIST_FIELD_FL_SYM;
- else if (strcmp(modifier, "sym-offset") == 0)
+ /*
+ * 'sym-offset' occurrences in the trigger string are modified
+ * to 'symXoffset' to simplify arithmetic expression parsing.
+ */
+ else if (strcmp(modifier, "symXoffset") == 0)
*flags |= HIST_FIELD_FL_SYM_OFFSET;
else if ((strcmp(modifier, "execname") == 0) &&
(strcmp(field_name, "common_pid") == 0))
*flags |= HIST_FIELD_FL_EXECNAME;
else if (strcmp(modifier, "syscall") == 0)
*flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(modifier, "stacktrace") == 0)
+ *flags |= HIST_FIELD_FL_STACKTRACE;
else if (strcmp(modifier, "log2") == 0)
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
- else {
+ else if (strncmp(modifier, "bucket", 6) == 0) {
+ int ret;
+
+ modifier += 6;
+
+ if (*modifier == 's')
+ modifier++;
+ if (*modifier != '=')
+ goto error;
+ modifier++;
+ ret = kstrtoul(modifier, 0, buckets);
+ if (ret || !(*buckets))
+ goto error;
+ *flags |= HIST_FIELD_FL_BUCKET;
+ } else if (strncmp(modifier, "percent", 7) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_PERCENT;
+ } else if (strncmp(modifier, "graph", 5) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_GRAPH;
+ } else {
+ error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
field = ERR_PTR(-EINVAL);
goto out;
@@ -1956,14 +2360,32 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
- } else if (strcmp(field_name, "cpu") == 0)
+ } else if (strcmp(field_name, "common_stacktrace") == 0) {
+ *flags |= HIST_FIELD_FL_STACKTRACE;
+ } else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
+ else if (strcmp(field_name, "hitcount") == 0)
+ *flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
- field = ERR_PTR(-EINVAL);
- goto out;
+ /*
+ * For backward compatibility, if field_name
+ * was "cpu" or "stacktrace", then we treat this
+ * the same as common_cpu and common_stacktrace
+ * respectively. This also works for "CPU", and
+ * "STACKTRACE".
+ */
+ if (field && field->filter_type == FILTER_CPU) {
+ *flags |= HIST_FIELD_FL_CPU;
+ } else if (field && field->filter_type == FILTER_STACKTRACE) {
+ *flags |= HIST_FIELD_FL_STACKTRACE;
+ } else {
+ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
+ errpos(field_name));
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
}
}
out:
@@ -1983,7 +2405,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
if (!alias)
return NULL;
- alias->fn = var_ref->fn;
+ alias->fn_num = var_ref->fn_num;
alias->operands[0] = var_ref;
if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -1996,6 +2418,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
return alias;
}
+static struct hist_field *parse_const(struct hist_trigger_data *hist_data,
+ char *str, char *var_name,
+ unsigned long *flags)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *field = NULL;
+ u64 constant;
+
+ if (kstrtoull(str, 0, &constant)) {
+ hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str));
+ return NULL;
+ }
+
+ *flags |= HIST_FIELD_FL_CONST;
+ field = create_hist_field(hist_data, NULL, *flags, var_name);
+ if (!field)
+ return NULL;
+
+ field->constant = constant;
+
+ return field;
+}
+
static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
struct trace_event_file *file, char *str,
unsigned long *flags, char *var_name)
@@ -2003,8 +2448,18 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
struct ftrace_event_field *field = NULL;
struct hist_field *hist_field = NULL;
+ unsigned long buckets = 0;
int ret = 0;
+ if (isdigit(str[0])) {
+ hist_field = parse_const(hist_data, str, var_name, flags);
+ if (!hist_field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ return hist_field;
+ }
+
s = strchr(str, '.');
if (s) {
s = strchr(++s, '.');
@@ -2040,7 +2495,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
} else
str = s;
- field = parse_field(hist_data, file, str, flags);
+ field = parse_field(hist_data, file, str, flags, &buckets);
if (IS_ERR(field)) {
ret = PTR_ERR(field);
goto out;
@@ -2051,6 +2506,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
ret = -ENOMEM;
goto out;
}
+ hist_field->buckets = buckets;
return hist_field;
out:
@@ -2060,21 +2516,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level);
+ char *var_name, unsigned int *n_subexprs);
static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1, *expr = NULL;
unsigned long operand_flags;
int ret = 0;
char *s;
+ /* Unary minus operator, increment n_subexprs */
+ ++*n_subexprs;
+
/* we support only -(xxx) i.e. explicit parens required */
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
ret = -EINVAL;
goto free;
@@ -2091,8 +2550,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
s = strrchr(str, ')');
- if (s)
+ if (s) {
+ /* unary minus not supported in sub-expressions */
+ if (*(s+1) != '\0') {
+ hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR,
+ errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
*s = '\0';
+ }
else {
ret = -EINVAL; /* no closing ')' */
goto free;
@@ -2106,19 +2573,28 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
operand_flags = 0;
- operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand1)) {
ret = PTR_ERR(operand1);
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ /* String type can not be the operand of unary operator. */
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ destroy_hist_field(operand1, 0);
+ ret = -EINVAL;
+ goto free;
+ }
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
- expr->fn = hist_field_unary_minus;
+ expr->fn_num = HIST_FIELD_FN_UMINUS;
expr->operands[0] = operand1;
+ expr->size = operand1->size;
+ expr->is_signed = operand1->is_signed;
expr->operator = FIELD_OP_UNARY_MINUS;
expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
if (!expr->type) {
ret = -ENOMEM;
goto free;
@@ -2130,9 +2606,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
return ERR_PTR(ret);
}
+/*
+ * If the operands are var refs, return pointers the
+ * variable(s) referenced in var1 and var2, else NULL.
+ */
static int check_expr_operands(struct trace_array *tr,
struct hist_field *operand1,
- struct hist_field *operand2)
+ struct hist_field *operand2,
+ struct hist_field **var1,
+ struct hist_field **var2)
{
unsigned long operand1_flags = operand1->flags;
unsigned long operand2_flags = operand2->flags;
@@ -2145,6 +2627,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand1_flags = var->flags;
+ *var1 = var;
}
if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
@@ -2155,6 +2638,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand2_flags = var->flags;
+ *var2 = var;
}
if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
@@ -2169,64 +2653,102 @@ static int check_expr_operands(struct trace_array *tr,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
- unsigned long operand_flags;
+ struct hist_field *var1 = NULL, *var2 = NULL;
+ unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
+ enum hist_field_fn op_fn;
+ bool combine_consts;
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
return ERR_PTR(-EINVAL);
}
- field_op = contains_operator(str);
+ field_op = contains_operator(str, &sep);
if (field_op == FIELD_OP_NONE)
return parse_atom(hist_data, file, str, &flags, var_name);
if (field_op == FIELD_OP_UNARY_MINUS)
- return parse_unary(hist_data, file, str, flags, var_name, ++level);
+ return parse_unary(hist_data, file, str, flags, var_name, n_subexprs);
- switch (field_op) {
- case FIELD_OP_MINUS:
- sep = "-";
- break;
- case FIELD_OP_PLUS:
- sep = "+";
- break;
- default:
- goto free;
- }
+ /* Binary operator found, increment n_subexprs */
+ ++*n_subexprs;
- operand1_str = strsep(&str, sep);
- if (!operand1_str || !str)
- goto free;
+ /* Split the expression string at the root operator */
+ if (!sep)
+ return ERR_PTR(-EINVAL);
+
+ *sep = '\0';
+ operand1_str = str;
+ str = sep+1;
+
+ /* Binary operator requires both operands */
+ if (*operand1_str == '\0' || *str == '\0')
+ return ERR_PTR(-EINVAL);
operand_flags = 0;
- operand1 = parse_atom(hist_data, file, operand1_str,
- &operand_flags, NULL);
- if (IS_ERR(operand1)) {
- ret = PTR_ERR(operand1);
- operand1 = NULL;
- goto free;
+
+ /* LHS of string is an expression e.g. a+b in a+b+c */
+ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs);
+ if (IS_ERR(operand1))
+ return ERR_CAST(operand1);
+
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
+ ret = -EINVAL;
+ goto free_op1;
}
- /* rest of string could be another expression e.g. b+c in a+b+c */
+ /* RHS of string is another expression e.g. c in a+b+c */
operand_flags = 0;
- operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand2)) {
ret = PTR_ERR(operand2);
- operand2 = NULL;
- goto free;
+ goto free_op1;
+ }
+ if (operand2->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ ret = -EINVAL;
+ goto free_operands;
}
- ret = check_expr_operands(file->tr, operand1, operand2);
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ op_fn = HIST_FIELD_FN_MINUS;
+ break;
+ case FIELD_OP_PLUS:
+ op_fn = HIST_FIELD_FN_PLUS;
+ break;
+ case FIELD_OP_DIV:
+ op_fn = HIST_FIELD_FN_DIV;
+ break;
+ case FIELD_OP_MULT:
+ op_fn = HIST_FIELD_FN_MULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free_operands;
+ }
+
+ ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2);
if (ret)
- goto free;
+ goto free_operands;
- flags |= HIST_FIELD_FL_EXPR;
+ operand_flags = var1 ? var1->flags : operand1->flags;
+ operand2_flags = var2 ? var2->flags : operand2->flags;
+
+ /*
+ * If both operands are constant, the expression can be
+ * collapsed to a single constant.
+ */
+ combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST;
+
+ flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR;
flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -2234,40 +2756,81 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr = create_hist_field(hist_data, NULL, flags, var_name);
if (!expr) {
ret = -ENOMEM;
- goto free;
+ goto free_operands;
}
operand1->read_once = true;
operand2->read_once = true;
+ /* The operands are now owned and free'd by 'expr' */
expr->operands[0] = operand1;
expr->operands[1] = operand2;
- expr->operator = field_op;
- expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
- if (!expr->type) {
- ret = -ENOMEM;
- goto free;
+
+ if (field_op == FIELD_OP_DIV &&
+ operand2_flags & HIST_FIELD_FL_CONST) {
+ u64 divisor = var2 ? var2->constant : operand2->constant;
+
+ if (!divisor) {
+ hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str));
+ ret = -EDOM;
+ goto free_expr;
+ }
+
+ /*
+ * Copy the divisor here so we don't have to look it up
+ * later if this is a var ref
+ */
+ operand2->constant = divisor;
+ op_fn = hist_field_get_div_fn(operand2);
}
- switch (field_op) {
- case FIELD_OP_MINUS:
- expr->fn = hist_field_minus;
- break;
- case FIELD_OP_PLUS:
- expr->fn = hist_field_plus;
- break;
- default:
- ret = -EINVAL;
- goto free;
+ expr->fn_num = op_fn;
+
+ if (combine_consts) {
+ if (var1)
+ expr->operands[0] = var1;
+ if (var2)
+ expr->operands[1] = var2;
+
+ expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
+ expr->fn_num = HIST_FIELD_FN_CONST;
+
+ expr->operands[0] = NULL;
+ expr->operands[1] = NULL;
+
+ /*
+ * var refs won't be destroyed immediately
+ * See: destroy_hist_field()
+ */
+ destroy_hist_field(operand2, 0);
+ destroy_hist_field(operand1, 0);
+
+ expr->name = expr_str(expr, 0);
+ } else {
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+ expr->is_signed = operand1->is_signed;
+
+ expr->operator = field_op;
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free_expr;
+ }
+
+ expr->name = expr_str(expr, 0);
}
return expr;
- free:
- destroy_hist_field(operand1, 0);
+
+free_operands:
destroy_hist_field(operand2, 0);
- destroy_hist_field(expr, 0);
+free_op1:
+ destroy_hist_field(operand1, 0);
+ return ERR_PTR(ret);
+free_expr:
+ destroy_hist_field(expr, 0);
return ERR_PTR(ret);
}
@@ -2289,9 +2852,10 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
}
static struct event_command trigger_hist_cmd;
-static int event_hist_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+static int event_hist_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter);
static bool compatible_keys(struct hist_trigger_data *target_hist_data,
struct hist_trigger_data *hist_data,
@@ -2389,7 +2953,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* If a user specifies a field on an event that isn't the event the
* histogram currently being defined (the target event histogram), the
* only way that can be accomplished is if a new hist trigger is
@@ -2408,12 +2972,12 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
char *subsys_name, char *event_name, char *field_name)
{
struct trace_array *tr = target_hist_data->event_file->tr;
- struct hist_field *event_var = ERR_PTR(-EINVAL);
struct hist_trigger_data *hist_data;
unsigned int i, n, first = true;
struct field_var_hist *var_hist;
struct trace_event_file *file;
struct hist_field *key_field;
+ struct hist_field *event_var;
char *saved_filter;
char *cmd;
int ret;
@@ -2494,8 +3058,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
var_hist->hist_data = hist_data;
/* Create the new histogram with our variable */
- ret = event_hist_trigger_func(&trigger_hist_cmd, file,
- "", "hist", cmd);
+ ret = event_hist_trigger_parse(&trigger_hist_cmd, file,
+ "", "hist", cmd);
if (ret) {
kfree(cmd);
kfree(var_hist->cmd);
@@ -2551,6 +3115,7 @@ find_target_event_var(struct hist_trigger_data *hist_data,
}
static inline void __update_field_vars(struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec,
struct field_var **field_vars,
@@ -2561,19 +3126,37 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
unsigned int i, j, var_idx;
u64 var_val;
+ /* Make sure stacktrace can fit in the string variable length */
+ BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX);
+
for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
struct field_var *field_var = field_vars[i];
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
- var_val = val->fn(val, elt, rbe, rec);
+ var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
- if (val->flags & HIST_FIELD_FL_STRING) {
+ if (val->flags & (HIST_FIELD_FL_STRING |
+ HIST_FIELD_FL_STACKTRACE)) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
-
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+ unsigned int size;
+
+ if (val->flags & HIST_FIELD_FL_STRING) {
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -2582,19 +3165,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
static void update_field_vars(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec)
{
- __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars,
hist_data->n_field_vars, 0);
}
static void save_track_data_vars(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
- __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars,
hist_data->n_save_vars, hist_data->n_field_var_str);
}
@@ -2629,10 +3214,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
var->var.hist_data = var->hist_data = hist_data;
var->size = size;
var->var.name = kstrdup(name, GFP_KERNEL);
- var->type = kstrdup(type, GFP_KERNEL);
+ var->type = kstrdup_const(type, GFP_KERNEL);
if (!var->var.name || !var->type) {
+ kfree_const(var->type);
kfree(var->var.name);
- kfree(var->type);
kfree(var);
var = ERR_PTR(-ENOMEM);
}
@@ -2699,7 +3284,7 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* This function creates a field variable with the name var_name on
* the hist trigger currently being defined on the target event. If
* subsys_name and event_name are specified, this function simply
@@ -2770,12 +3355,14 @@ static void save_track_val(struct hist_trigger_data *hist_data,
}
static void save_track_data(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
if (data->track_data.save_data)
- data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->track_data.save_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
static bool check_track_val(struct tracing_map_elt *elt,
@@ -2826,7 +3413,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals)
@@ -2895,7 +3483,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
return false;
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals) {}
@@ -2937,7 +3526,8 @@ static void track_data_print(struct seq_file *m,
}
static void ontrack_action(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -2945,7 +3535,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data,
if (check_track_val(elt, data, var_val)) {
save_track_val(hist_data, elt, data, var_val);
- save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ save_track_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
}
@@ -3053,6 +3644,7 @@ static int parse_action_params(struct trace_array *tr, char *params,
while (params) {
if (data->n_params >= SYNTH_FIELDS_MAX) {
hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
+ ret = -EINVAL;
goto out;
}
@@ -3263,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data,
{
hist_data->field_vars[hist_data->n_field_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ /* Stack traces are saved in the string storage too */
+ if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_field_var_str++;
}
@@ -3279,9 +3872,21 @@ static int check_synth_field(struct synth_event *event,
field = event->fields[field_pos];
+ /*
+ * A dynamic string synth field can accept static or
+ * dynamic. A static string synth field can only accept a
+ * same-sized static string, which is checked for later.
+ */
+ if (strstr(hist_field->type, "char[") && field->is_string
+ && field->is_dynamic)
+ return 0;
+
+ if (strstr(hist_field->type, "long[") && field->is_stack)
+ return 0;
+
if (strcmp(field->type, hist_field->type) != 0) {
if (field->size != hist_field->size ||
- field->is_signed != hist_field->is_signed)
+ (!field->is_string && field->is_signed != hist_field->is_signed))
return -EINVAL;
}
@@ -3336,7 +3941,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
} else {
field_var = NULL;
/*
- * If no explicit system.event is specfied, default to
+ * If no explicit system.event is specified, default to
* looking for fields on the onmatch(system.event.xxx)
* event.
*/
@@ -3345,6 +3950,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
@@ -3378,6 +3985,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
lockdep_assert_held(&event_mutex);
+ /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */
+ if (data->n_params > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
if (data->use_trace_keyword)
synth_event_name = data->synth_event_name;
else
@@ -3441,6 +4052,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
var_ref_idx = find_var_ref_idx(hist_data, var_ref);
if (WARN_ON(var_ref_idx < 0)) {
+ kfree(p);
ret = var_ref_idx;
goto err;
}
@@ -3526,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data,
}
hist_data->save_vars[hist_data->n_save_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ if (field_var->val->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_save_var_str++;
kfree(param);
}
@@ -3614,14 +4227,29 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
unsigned long flags)
{
struct hist_field *hist_field;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
- hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
}
+ /* values and variables should not have some modifiers */
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ /* Variable */
+ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
+ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2))
+ goto err;
+ } else {
+ /* Value */
+ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
+ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 |
+ HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET |
+ HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE))
+ goto err;
+ }
+
hist_data->fields[val_idx] = hist_field;
++hist_data->n_vals;
@@ -3631,6 +4259,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
ret = -EINVAL;
out:
return ret;
+ err:
+ hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str));
+ return -EINVAL;
}
static int create_val_field(struct hist_trigger_data *hist_data,
@@ -3644,6 +4275,124 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
+static const char no_comm[] = "(no comm)";
+
+static u64 hist_field_execname(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+
+ if (WARN_ON_ONCE(!elt))
+ return (u64)(unsigned long)no_comm;
+
+ elt_data = elt->private_data;
+
+ if (WARN_ON_ONCE(!elt_data->comm))
+ return (u64)(unsigned long)no_comm;
+
+ return (u64)(unsigned long)(elt_data->comm);
+}
+
+static u64 hist_field_stack(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ u32 str_item = *(u32 *)(event + hist_field->field->offset);
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)(event + str_loc);
+
+ return (u64)(unsigned long)addr;
+}
+
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ switch (hist_field->fn_num) {
+ case HIST_FIELD_FN_VAR_REF:
+ return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COUNTER:
+ return hist_field_counter(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CONST:
+ return hist_field_const(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_LOG2:
+ return hist_field_log2(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_BUCKET:
+ return hist_field_bucket(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_TIMESTAMP:
+ return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CPU:
+ return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STRING:
+ return hist_field_string(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DYNSTRING:
+ return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_RELDYNSTRING:
+ return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PSTRING:
+ return hist_field_pstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S64:
+ return hist_field_s64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U64:
+ return hist_field_u64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S32:
+ return hist_field_s32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U32:
+ return hist_field_u32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S16:
+ return hist_field_s16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U16:
+ return hist_field_u16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S8:
+ return hist_field_s8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U8:
+ return hist_field_u8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_UMINUS:
+ return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MINUS:
+ return hist_field_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PLUS:
+ return hist_field_plus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV:
+ return hist_field_div(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MULT:
+ return hist_field_mult(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_POWER2:
+ return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_NOT_POWER2:
+ return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_MULT_SHIFT:
+ return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_EXECNAME:
+ return hist_field_execname(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STACK:
+ return hist_field_stack(hist_field, elt, buffer, rbe, event);
+ default:
+ return 0;
+ }
+}
+
+/* Convert a var that points to common_pid.execname to a string */
+static void update_var_execname(struct hist_field *hist_field)
+{
+ hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR |
+ HIST_FIELD_FL_EXECNAME;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->is_signed = 0;
+
+ kfree_const(hist_field->type);
+ hist_field->type = "char[]";
+
+ hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
+}
+
static int create_var_field(struct hist_trigger_data *hist_data,
unsigned int val_idx,
struct trace_event_file *file,
@@ -3651,6 +4400,7 @@ static int create_var_field(struct hist_trigger_data *hist_data,
{
struct trace_array *tr = hist_data->event_file->tr;
unsigned long flags = 0;
+ int ret;
if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
return -EINVAL;
@@ -3665,14 +4415,23 @@ static int create_var_field(struct hist_trigger_data *hist_data,
if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
return -EINVAL;
- return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+ ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+
+ if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
+ update_var_execname(hist_data->fields[val_idx]);
+
+ if (!ret && hist_data->fields[val_idx]->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
+ hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
+
+ return ret;
}
static int create_val_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
+ unsigned int i, j = 1, n_hitcount = 0;
char *fields_str, *field_str;
- unsigned int i, j = 1;
int ret;
ret = create_hitcount_val(hist_data);
@@ -3689,8 +4448,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!field_str)
break;
- if (strcmp(field_str, "hitcount") == 0)
- continue;
+ if (strcmp(field_str, "hitcount") == 0) {
+ if (!n_hitcount++)
+ continue;
+ }
ret = create_val_field(hist_data, j++, file, field_str);
if (ret)
@@ -3700,6 +4461,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (fields_str && (strcmp(fields_str, "hitcount") != 0))
ret = -EINVAL;
out:
+ /* There is only raw hitcount but nohitcount suppresses it. */
+ if (j == 1 && hist_data->attrs->no_hitcount) {
+ hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0);
+ ret = -ENOENT;
+ }
+
return ret;
}
@@ -3713,7 +4480,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct hist_field *hist_field = NULL;
unsigned long flags = 0;
unsigned int key_size;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
return -EINVAL;
@@ -3726,7 +4493,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
hist_field = create_hist_field(hist_data, NULL, flags, NULL);
} else {
hist_field = parse_expr(hist_data, file, field_str, flags,
- NULL, 0);
+ NULL, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
@@ -3866,6 +4633,7 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL);
if (!s) {
kfree(hist_data->attrs->var_defs.name[n_vars]);
+ hist_data->attrs->var_defs.name[n_vars] = NULL;
ret = -ENOMEM;
goto free;
}
@@ -3889,7 +4657,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
ret = parse_var_defs(hist_data);
if (ret)
- goto out;
+ return ret;
ret = create_val_fields(hist_data, file);
if (ret)
@@ -3900,8 +4668,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
goto out;
ret = create_key_fields(hist_data, file);
- if (ret)
- goto out;
+
out:
free_var_defs(hist_data);
@@ -4038,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ enum handler_id hid = 0;
+ char *action_str;
+
str = hist_data->attrs->action_str[i];
- if ((len = str_has_prefix(str, "onmatch("))) {
- char *action_str = str + len;
+ if ((len = str_has_prefix(str, "onmatch(")))
+ hid = HANDLER_ONMATCH;
+ else if ((len = str_has_prefix(str, "onmax(")))
+ hid = HANDLER_ONMAX;
+ else if ((len = str_has_prefix(str, "onchange(")))
+ hid = HANDLER_ONCHANGE;
- data = onmatch_parse(tr, action_str);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onmax("))) {
- char *action_str = str + len;
+ action_str = str + len;
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONMAX);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onchange("))) {
- char *action_str = str + len;
+ switch (hid) {
+ case HANDLER_ONMATCH:
+ data = onmatch_parse(tr, action_str);
+ break;
+ case HANDLER_ONMAX:
+ case HANDLER_ONCHANGE:
+ data = track_data_parse(hist_data, action_str, hid);
+ break;
+ default:
+ data = ERR_PTR(-EINVAL);
+ break;
+ }
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONCHANGE);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else {
- ret = -EINVAL;
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
break;
}
@@ -4291,7 +5057,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
- else if (!field)
+ else if (!field || hist_field->flags & HIST_FIELD_FL_CPU)
cmp_fn = tracing_map_cmp_num(hist_field->size,
hist_field->is_signed);
else if (is_string_field(field))
@@ -4376,7 +5142,8 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe,
u64 *var_ref_vals)
{
@@ -4390,9 +5157,40 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
+
+ if (hist_field->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) {
+ unsigned int str_start, var_str_idx, idx;
+ char *str, *val_str;
+ unsigned int size;
+
+ str_start = hist_data->n_field_var_str +
+ hist_data->n_save_var_str;
+ var_str_idx = hist_field->var_str_idx;
+ idx = str_start + var_str_idx;
+
+ str = elt_data->field_var_str[idx];
+ val_str = (char *)(uintptr_t)hist_val;
+
+ if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
+ hist_val = (u64)(uintptr_t)str;
+ }
tracing_map_set_var(elt, var_idx, hist_val);
continue;
}
@@ -4402,13 +5200,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
}
- update_field_vars(hist_data, elt, rbe, rec);
+ update_field_vars(hist_data, elt, buffer, rbe, rec);
}
static inline void add_to_key(char *compound_key, void *key,
@@ -4420,10 +5218,9 @@ static inline void add_to_key(char *compound_key, void *key,
struct ftrace_event_field *field;
field = key_field->field;
- if (field->filter_type == FILTER_DYN_STRING)
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING)
size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_PTR_STRING)
- size = strlen(key);
else if (field->filter_type == FILTER_STATIC_STRING)
size = field->size;
@@ -4438,7 +5235,8 @@ static inline void add_to_key(char *compound_key, void *key,
static void
hist_trigger_actions(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
u64 *var_ref_vals)
{
@@ -4447,11 +5245,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->n_actions; i++) {
data = hist_data->actions[i];
- data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals);
}
}
-static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+static void event_hist_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -4465,6 +5264,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
void *key = NULL;
unsigned int i;
+ if (unlikely(!rbe))
+ return;
+
memset(compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
@@ -4472,11 +5274,20 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
memset(entries, 0, HIST_STACKTRACE_SIZE);
- stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
- HIST_STACKTRACE_SKIP);
+ if (key_field->field) {
+ unsigned long *stack, n_entries;
+
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
+ stack = (unsigned long *)(long)field_contents;
+ n_entries = *stack;
+ memcpy(entries, ++stack, n_entries * sizeof(unsigned long));
+ } else {
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ }
key = entries;
} else {
- field_contents = key_field->fn(key_field, elt, rbe, rec);
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -4499,17 +5310,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
if (!elt)
return;
- hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned long *stacktrace_entries,
unsigned int max_entries)
{
- char str[KSYM_SYMBOL_LEN];
unsigned int spaces = 8;
unsigned int i;
@@ -4518,8 +5328,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
- sprint_symbol(str, stacktrace_entries[i]);
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]);
}
}
@@ -4529,7 +5338,6 @@ static void hist_trigger_print_key(struct seq_file *m,
struct tracing_map_elt *elt)
{
struct hist_field *key_field;
- char str[KSYM_SYMBOL_LEN];
bool multiline = false;
const char *field_name;
unsigned int i;
@@ -4550,14 +5358,12 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_printf(m, "%s: %llx", field_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM) {
uval = *(u64 *)(key + key_field->offset);
- sprint_symbol_no_offset(str, uval);
- seq_printf(m, "%s: [%llx] %-45s", field_name,
- uval, str);
+ seq_printf(m, "%s: [%llx] %-45ps", field_name,
+ uval, (void *)(uintptr_t)uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
uval = *(u64 *)(key + key_field->offset);
- sprint_symbol(str, uval);
- seq_printf(m, "%s: [%llx] %-55s", field_name,
- uval, str);
+ seq_printf(m, "%s: [%llx] %-55pS", field_name,
+ uval, (void *)(uintptr_t)uval);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
struct hist_elt_data *elt_data = elt->private_data;
char *comm;
@@ -4581,7 +5387,10 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_printf(m, "%s: %-30s[%3llu]", field_name,
syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- seq_puts(m, "stacktrace:\n");
+ if (key_field->field)
+ seq_printf(m, "%s.stacktrace", key_field->field->name);
+ else
+ seq_puts(m, "common_stacktrace:\n");
hist_trigger_stacktrace_print(m,
key + key_field->offset,
HIST_STACKTRACE_DEPTH);
@@ -4589,6 +5398,11 @@ static void hist_trigger_print_key(struct seq_file *m,
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
+ } else if (key_field->flags & HIST_FIELD_FL_BUCKET) {
+ unsigned long buckets = key_field->buckets;
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: ~ %llu-%llu", field_name,
+ uval, uval + buckets -1);
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
@@ -4604,33 +5418,101 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_puts(m, "}");
}
+/* Get the 100 times of the percentage of @val in @total */
+static inline unsigned int __get_percentage(u64 val, u64 total)
+{
+ if (!total)
+ goto div0;
+
+ if (val < (U64_MAX / 10000))
+ return (unsigned int)div64_ul(val * 10000, total);
+
+ total = div64_u64(total, 10000);
+ if (!total)
+ goto div0;
+
+ return (unsigned int)div64_ul(val, total);
+div0:
+ return val ? UINT_MAX : 0;
+}
+
+#define BAR_CHAR '#'
+
+static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max)
+{
+ unsigned int len = __get_percentage(val, max);
+ int i;
+
+ if (len == UINT_MAX) {
+ snprintf(buf, size, "[ERROR]");
+ return buf;
+ }
+
+ len = len * size / 10000;
+ for (i = 0; i < len && i < size; i++)
+ buf[i] = BAR_CHAR;
+ while (i < size)
+ buf[i++] = ' ';
+ buf[size] = '\0';
+
+ return buf;
+}
+
+struct hist_val_stat {
+ u64 max;
+ u64 total;
+};
+
+static void hist_trigger_print_val(struct seq_file *m, unsigned int idx,
+ const char *field_name, unsigned long flags,
+ struct hist_val_stat *stats,
+ struct tracing_map_elt *elt)
+{
+ u64 val = tracing_map_read_sum(elt, idx);
+ unsigned int pc;
+ char bar[21];
+
+ if (flags & HIST_FIELD_FL_PERCENT) {
+ pc = __get_percentage(val, stats[idx].total);
+ if (pc == UINT_MAX)
+ seq_printf(m, " %s (%%):[ERROR]", field_name);
+ else
+ seq_printf(m, " %s (%%): %3u.%02u", field_name,
+ pc / 100, pc % 100);
+ } else if (flags & HIST_FIELD_FL_GRAPH) {
+ seq_printf(m, " %s: %20s", field_name,
+ __fill_bar_str(bar, 20, val, stats[idx].max));
+ } else if (flags & HIST_FIELD_FL_HEX) {
+ seq_printf(m, " %s: %10llx", field_name, val);
+ } else {
+ seq_printf(m, " %s: %10llu", field_name, val);
+ }
+}
+
static void hist_trigger_entry_print(struct seq_file *m,
struct hist_trigger_data *hist_data,
+ struct hist_val_stat *stats,
void *key,
struct tracing_map_elt *elt)
{
const char *field_name;
- unsigned int i;
+ unsigned int i = HITCOUNT_IDX;
+ unsigned long flags;
hist_trigger_print_key(m, hist_data, key, elt);
- seq_printf(m, " hitcount: %10llu",
- tracing_map_read_sum(elt, HITCOUNT_IDX));
+ /* At first, show the raw hitcount if !nohitcount */
+ if (!hist_data->attrs->no_hitcount)
+ hist_trigger_print_val(m, i, "hitcount", 0, stats, elt);
for (i = 1; i < hist_data->n_vals; i++) {
field_name = hist_field_name(hist_data->fields[i], 0);
-
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
- hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+ flags = hist_data->fields[i]->flags;
+ if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR)
continue;
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx", field_name,
- tracing_map_read_sum(elt, i));
- } else {
- seq_printf(m, " %s: %10llu", field_name,
- tracing_map_read_sum(elt, i));
- }
+ seq_puts(m, " ");
+ hist_trigger_print_val(m, i, field_name, flags, stats, elt);
}
print_actions(m, hist_data, elt);
@@ -4643,7 +5525,9 @@ static int print_entries(struct seq_file *m,
{
struct tracing_map_sort_entry **sort_entries = NULL;
struct tracing_map *map = hist_data->map;
- int i, n_entries;
+ int i, j, n_entries;
+ struct hist_val_stat *stats = NULL;
+ u64 val;
n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
hist_data->n_sort_keys,
@@ -4651,11 +5535,34 @@ static int print_entries(struct seq_file *m,
if (n_entries < 0)
return n_entries;
+ /* Calculate the max and the total for each field if needed. */
+ for (j = 0; j < hist_data->n_vals; j++) {
+ if (!(hist_data->fields[j]->flags &
+ (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH)))
+ continue;
+ if (!stats) {
+ stats = kcalloc(hist_data->n_vals, sizeof(*stats),
+ GFP_KERNEL);
+ if (!stats) {
+ n_entries = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < n_entries; i++) {
+ val = tracing_map_read_sum(sort_entries[i]->elt, j);
+ stats[j].total += val;
+ if (stats[j].max < val)
+ stats[j].max = val;
+ }
+ }
+
for (i = 0; i < n_entries; i++)
- hist_trigger_entry_print(m, hist_data,
+ hist_trigger_entry_print(m, hist_data, stats,
sort_entries[i]->key,
sort_entries[i]->elt);
+ kfree(stats);
+out:
tracing_map_destroy_sort_entries(sort_entries, n_entries);
return n_entries;
@@ -4671,7 +5578,7 @@ static void hist_trigger_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -4715,10 +5622,12 @@ static int event_hist_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_show, file);
}
@@ -4726,7 +5635,7 @@ const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -4748,6 +5657,8 @@ static void hist_field_debug_show_flags(struct seq_file *m,
if (flags & HIST_FIELD_FL_ALIAS)
seq_puts(m, " HIST_FIELD_FL_ALIAS\n");
+ else if (flags & HIST_FIELD_FL_CONST)
+ seq_puts(m, " HIST_FIELD_FL_CONST\n");
}
static int hist_field_debug_show(struct seq_file *m,
@@ -4769,6 +5680,9 @@ static int hist_field_debug_show(struct seq_file *m,
field->var.idx);
}
+ if (field->flags & HIST_FIELD_FL_CONST)
+ seq_printf(m, " constant: %llu\n", field->constant);
+
if (field->flags & HIST_FIELD_FL_ALIAS)
seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
field->var_ref_idx);
@@ -4898,7 +5812,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -4987,10 +5901,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_debug_show, file);
}
@@ -4998,7 +5914,7 @@ const struct file_operations event_hist_debug_fops = {
.open = event_hist_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#endif
@@ -5010,7 +5926,9 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, "%s=", hist_field->var.name);
if (hist_field->flags & HIST_FIELD_FL_CPU)
- seq_puts(m, "cpu");
+ seq_puts(m, "common_cpu");
+ else if (hist_field->flags & HIST_FIELD_FL_CONST)
+ seq_printf(m, "%llu", hist_field->constant);
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
@@ -5021,25 +5939,28 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags) {
if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
- !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+ !(hist_field->flags & HIST_FIELD_FL_EXPR) &&
+ !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) {
const char *flags = get_hist_field_flags(hist_field);
if (flags)
seq_printf(m, ".%s", flags);
}
}
+ if (hist_field->buckets)
+ seq_printf(m, "=%ld", hist_field->buckets);
}
static int event_hist_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
struct hist_field *field;
bool have_var = false;
+ bool show_val = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
@@ -5052,9 +5973,12 @@ static int event_hist_trigger_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ",");
- if (field->flags & HIST_FIELD_FL_STACKTRACE)
- seq_puts(m, "stacktrace");
- else
+ if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ seq_printf(m, "%s.stacktrace", field->field->name);
+ else
+ seq_puts(m, "common_stacktrace");
+ } else
hist_field_print(m, field);
}
@@ -5067,12 +5991,16 @@ static int event_hist_trigger_print(struct seq_file *m,
continue;
}
- if (i == HITCOUNT_IDX)
+ if (i == HITCOUNT_IDX) {
+ if (hist_data->attrs->no_hitcount)
+ continue;
seq_puts(m, "hitcount");
- else {
- seq_puts(m, ",");
+ } else {
+ if (show_val)
+ seq_puts(m, ",");
hist_field_print(m, field);
}
+ show_val = true;
}
if (have_var) {
@@ -5123,6 +6051,8 @@ static int event_hist_trigger_print(struct seq_file *m,
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
if (hist_data->enable_timestamps)
seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+ if (hist_data->attrs->no_hitcount)
+ seq_puts(m, ":nohitcount");
print_actions_spec(m, hist_data);
@@ -5139,8 +6069,7 @@ static int event_hist_trigger_print(struct seq_file *m,
return 0;
}
-static int event_hist_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5162,13 +6091,13 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_field_var_hists; i++) {
file = hist_data->field_var_hists[i]->hist_data->event_file;
cmd = hist_data->field_var_hists[i]->cmd;
- ret = event_hist_trigger_func(&trigger_hist_cmd, file,
- "!hist", "hist", cmd);
+ ret = event_hist_trigger_parse(&trigger_hist_cmd, file,
+ "!hist", "hist", cmd);
+ WARN_ON_ONCE(ret < 0);
}
}
-static void event_hist_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_free(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5191,31 +6120,29 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
}
static struct event_trigger_ops event_hist_trigger_ops = {
- .func = event_hist_trigger,
+ .trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_init,
.free = event_hist_trigger_free,
};
-static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_named_init(struct event_trigger_data *data)
{
data->ref++;
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(ops, data->named_data);
+ event_hist_trigger_init(data->named_data);
return 0;
}
-static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_named_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
- event_hist_trigger_free(ops, data->named_data);
+ event_hist_trigger_free(data->named_data);
data->ref--;
if (!data->ref) {
@@ -5225,7 +6152,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
}
static struct event_trigger_ops event_hist_trigger_named_ops = {
- .func = event_hist_trigger,
+ .trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_named_init,
.free = event_hist_trigger_named_free,
@@ -5342,7 +6269,49 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return true;
}
-static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+static bool existing_hist_update_only(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool updated = false;
+
+ if (!hist_data->attrs->pause && !hist_data->attrs->cont &&
+ !hist_data->attrs->clear)
+ goto out;
+
+ if (hist_data->attrs->name) {
+ named_data = find_named_trigger(hist_data->attrs->name);
+ if (named_data) {
+ if (!hist_trigger_match(data, named_data, named_data,
+ true))
+ goto out;
+ }
+ }
+
+ if (hist_data->attrs->name && !named_data)
+ goto out;
+
+ list_for_each_entry(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ if (hist_data->attrs->pause)
+ test->paused = true;
+ else if (hist_data->attrs->cont)
+ test->paused = false;
+ else if (hist_data->attrs->clear)
+ hist_clear(test);
+ updated = true;
+ goto out;
+ }
+ }
+ out:
+ return updated;
+}
+
+static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -5370,19 +6339,11 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
- continue;
- if (hist_data->attrs->pause)
- test->paused = true;
- else if (hist_data->attrs->cont)
- test->paused = false;
- else if (hist_data->attrs->clear)
- hist_clear(test);
- else {
+ if (hist_trigger_match(data, test, named_data, false)) {
hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
ret = -EEXIST;
+ goto out;
}
- goto out;
}
}
new:
@@ -5402,7 +6363,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
@@ -5416,13 +6377,11 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
- tracing_set_time_stamp_abs(file->tr, true);
+ tracing_set_filter_buffering(file->tr, true);
}
if (named_data)
destroy_hist_data(hist_data);
-
- ret++;
out:
return ret;
}
@@ -5494,24 +6453,23 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data,
return false;
}
-static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
+static void hist_unregister_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
+ struct event_trigger_data *test = NULL, *iter, *named_data = NULL;
struct hist_trigger_data *hist_data = data->private_data;
- struct event_trigger_data *test, *named_data = NULL;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, iter, named_data, false))
continue;
- unregistered = true;
+ test = iter;
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -5519,12 +6477,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
}
}
- if (unregistered && test->ops->free)
- test->ops->free(test->ops, test);
+ if (test && test->ops->free)
+ test->ops->free(test);
if (hist_data->enable_timestamps) {
- if (!hist_data->remove || unregistered)
- tracing_set_time_stamp_abs(file->tr, false);
+ if (!hist_data->remove || test)
+ tracing_set_filter_buffering(file->tr, false);
}
}
@@ -5571,74 +6529,86 @@ static void hist_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
if (hist_data->enable_timestamps)
- tracing_set_time_stamp_abs(file->tr, false);
+ tracing_set_filter_buffering(file->tr, false);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
-static int event_hist_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+static int event_hist_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter)
{
unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
struct event_trigger_data *trigger_data;
struct hist_trigger_attrs *attrs;
- struct event_trigger_ops *trigger_ops;
struct hist_trigger_data *hist_data;
+ char *param, *filter, *p, *start;
struct synth_event *se;
const char *se_name;
- bool remove = false;
- char *trigger, *p;
+ bool remove;
int ret = 0;
lockdep_assert_held(&event_mutex);
- if (glob && strlen(glob)) {
+ if (WARN_ON(!glob))
+ return -EINVAL;
+
+ if (glob[0]) {
hist_err_clear();
- last_cmd_set(file, param);
+ last_cmd_set(file, param_and_filter);
}
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- if (glob[0] == '!')
- remove = true;
+ if (event_trigger_empty_param(param_and_filter))
+ return -EINVAL;
/*
* separate the trigger from the filter (k:v [if filter])
* allowing for whitespace in the trigger
*/
- p = trigger = param;
+ p = param = param_and_filter;
do {
p = strstr(p, "if");
if (!p)
break;
- if (p == param)
+ if (p == param_and_filter)
return -EINVAL;
if (*(p - 1) != ' ' && *(p - 1) != '\t') {
p++;
continue;
}
- if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
+ if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1)
return -EINVAL;
if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
p++;
continue;
}
break;
- } while (p);
+ } while (1);
if (!p)
- param = NULL;
+ filter = NULL;
else {
*(p - 1) = '\0';
- param = strstrip(p);
- trigger = strstrip(trigger);
+ filter = strstrip(p);
+ param = strstrip(param);
+ }
+
+ /*
+ * To simplify arithmetic expression parsing, replace occurrences of
+ * '.sym-offset' modifier with '.symXoffset'
+ */
+ start = strstr(param, ".sym-offset");
+ while (start) {
+ *(start + 4) = 'X';
+ start = strstr(start + 11, ".sym-offset");
}
- attrs = parse_hist_trigger_attrs(file->tr, trigger);
+ attrs = parse_hist_trigger_attrs(file->tr, param);
if (IS_ERR(attrs))
return PTR_ERR(attrs);
@@ -5651,29 +6621,15 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
}
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
-
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
-
- trigger_data->private_data = hist_data;
-
- /* if param is non-empty, it's supposed to be a filter */
- if (param && cmd_ops->set_filter) {
- ret = cmd_ops->set_filter(param, trigger_data, file);
- if (ret < 0)
- goto out_free;
- }
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
+ if (ret < 0)
+ goto out_free;
if (remove) {
if (!have_hist_trigger_match(trigger_data, file))
@@ -5684,7 +6640,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
@@ -5693,29 +6649,26 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
- /*
- * The above returns on success the # of triggers registered,
- * but if it didn't register any it returns zero. Consider no
- * triggers registered a failure too.
- */
- if (!ret) {
- if (!(attrs->pause || attrs->cont || attrs->clear))
- ret = -ENOENT;
+ if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
goto out_free;
if (get_named_trigger_data(trigger_data))
goto enable;
- if (has_hist_vars(hist_data))
- save_hist_vars(hist_data);
-
ret = create_actions(hist_data);
if (ret)
goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_unreg;
+ }
+
ret = tracing_map_init(hist_data->map);
if (ret)
goto out_unreg;
@@ -5728,18 +6681,15 @@ enable:
se = find_synth_event(se_name);
if (se)
se->ref++;
- /* Just return zero, not the number of registered triggers */
- ret = 0;
out:
- if (ret == 0)
+ if (ret == 0 && glob[0])
hist_err_clear();
return ret;
out_unreg:
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
remove_hist_vars(hist_data);
@@ -5753,7 +6703,7 @@ static struct event_command trigger_hist_cmd = {
.name = "hist",
.trigger_type = ETT_EVENT_HIST,
.flags = EVENT_CMD_FL_NEEDS_REC,
- .func = event_hist_trigger_func,
+ .parse = event_hist_trigger_parse,
.reg = hist_register_trigger,
.unreg = hist_unregister_trigger,
.unreg_all = hist_unreg_all,
@@ -5772,7 +6722,8 @@ __init int register_trigger_hist_cmd(void)
}
static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -5790,7 +6741,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
}
static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -5799,32 +6751,32 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- hist_enable_trigger(data, rec, event);
+ hist_enable_trigger(data, buffer, rec, event);
}
static struct event_trigger_ops hist_enable_trigger_ops = {
- .func = hist_enable_trigger,
+ .trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_enable_count_trigger_ops = {
- .func = hist_enable_count_trigger,
+ .trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_disable_trigger_ops = {
- .func = hist_enable_trigger,
+ .trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_disable_count_trigger_ops = {
- .func = hist_enable_count_trigger,
+ .trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
@@ -5858,7 +6810,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
@@ -5866,7 +6818,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
static struct event_command trigger_hist_enable_cmd = {
.name = ENABLE_HIST_STR,
.trigger_type = ETT_HIST_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
@@ -5877,7 +6829,7 @@ static struct event_command trigger_hist_enable_cmd = {
static struct event_command trigger_hist_disable_cmd = {
.name = DISABLE_HIST_STR,
.trigger_type = ETT_HIST_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 22bcf7c51d1e..8650562bdaa9 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
continue;
if (field->filter_type == FILTER_STATIC_STRING)
continue;
- if (field->filter_type == FILTER_DYN_STRING) {
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING) {
u32 *str_item;
int str_loc = entry_size & 0xffff;
+ if (field->filter_type == FILTER_RDYN_STRING)
+ str_loc -= field->offset + field->size;
+
str_item = (u32 *)(entry + field->offset);
*str_item = str_loc; /* string length is 0. */
} else {
@@ -192,7 +196,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
{
struct ftrace_event_field *field;
- unsigned long irq_flags;
void *entry = NULL;
int entry_size;
u64 val = 0;
@@ -203,9 +206,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
if (!entry)
return -ENOMEM;
- local_save_flags(irq_flags);
- tracing_generic_entry_update(entry, call->event.type, irq_flags,
- preempt_count());
+ tracing_generic_entry_update(entry, call->event.type,
+ tracing_gen_ctx());
while ((len = parse_field(str, call, &field, &val)) > 0) {
if (is_function_field(field))
@@ -215,8 +217,9 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
char *addr = (char *)(unsigned long) val;
if (field->filter_type == FILTER_STATIC_STRING) {
- strlcpy(entry + field->offset, addr, field->size);
- } else if (field->filter_type == FILTER_DYN_STRING) {
+ strscpy(entry + field->offset, addr, field->size);
+ } else if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING) {
int str_len = strlen(addr) + 1;
int str_loc = entry_size & 0xffff;
u32 *str_item;
@@ -229,8 +232,10 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
}
entry = *pentry;
- strlcpy(entry + (entry_size - str_len), addr, str_len);
+ strscpy(entry + (entry_size - str_len), addr, str_len);
str_item = (u32 *)(entry + field->offset);
+ if (field->filter_type == FILTER_RDYN_STRING)
+ str_loc -= field->offset + field->size;
*str_item = (str_len << 16) | str_loc;
} else {
char **paddr;
@@ -323,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size,
}
const struct file_operations event_inject_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_inject_read,
.write = event_inject_write,
+ .release = tracing_release_file_tr,
};
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c6cca0d1d584..c82b401a294d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -17,10 +17,74 @@
/* for gfp flag names */
#include <linux/trace_events.h>
#include <trace/events/mmflags.h>
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
#include "trace_synth.h"
-static int create_synth_event(int argc, const char **argv);
+#undef ERRORS
+#define ERRORS \
+ C(BAD_NAME, "Illegal name"), \
+ C(INVALID_CMD, "Command must be of the form: <name> field[;field] ..."),\
+ C(INVALID_DYN_CMD, "Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\
+ C(EVENT_EXISTS, "Event already exists"), \
+ C(TOO_MANY_FIELDS, "Too many fields"), \
+ C(INCOMPLETE_TYPE, "Incomplete type"), \
+ C(INVALID_TYPE, "Invalid type"), \
+ C(INVALID_FIELD, "Invalid field"), \
+ C(INVALID_ARRAY_SPEC, "Invalid array specification"),
+
+#undef C
+#define C(a, b) SYNTH_ERR_##a
+
+enum { ERRORS };
+
+#undef C
+#define C(a, b) b
+
+static const char *err_text[] = { ERRORS };
+
+static DEFINE_MUTEX(lastcmd_mutex);
+static char *last_cmd;
+
+static int errpos(const char *str)
+{
+ int ret = 0;
+
+ mutex_lock(&lastcmd_mutex);
+ if (!str || !last_cmd)
+ goto out;
+
+ ret = err_pos(last_cmd, str);
+ out:
+ mutex_unlock(&lastcmd_mutex);
+ return ret;
+}
+
+static void last_cmd_set(const char *str)
+{
+ if (!str)
+ return;
+
+ mutex_lock(&lastcmd_mutex);
+ kfree(last_cmd);
+ last_cmd = kstrdup(str, GFP_KERNEL);
+ mutex_unlock(&lastcmd_mutex);
+}
+
+static void synth_err(u8 err_type, u16 err_pos)
+{
+ mutex_lock(&lastcmd_mutex);
+ if (!last_cmd)
+ goto out;
+
+ tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
+ err_type, err_pos);
+ out:
+ mutex_unlock(&lastcmd_mutex);
+}
+
+static int create_synth_event(const char *raw_command);
static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
@@ -63,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event,
struct synth_trace_event {
struct trace_entry ent;
- u64 fields[];
+ union trace_synth_field fields[];
};
static int synth_event_define_fields(struct trace_event_call *call)
@@ -88,7 +152,7 @@ static int synth_event_define_fields(struct trace_event_call *call)
event->fields[i]->offset = n_u64;
- if (event->fields[i]->is_string) {
+ if (event->fields[i]->is_string && !event->fields[i]->is_dynamic) {
offset += STR_VAR_LEN_MAX;
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
@@ -120,6 +184,14 @@ static int synth_field_is_string(char *type)
return false;
}
+static int synth_field_is_stack(char *type)
+{
+ if (strstr(type, "long[") != NULL)
+ return true;
+
+ return false;
+}
+
static int synth_field_string_size(char *type)
{
char buf[4], *end, *start;
@@ -132,13 +204,16 @@ static int synth_field_string_size(char *type)
start += sizeof("char[") - 1;
end = strchr(type, ']');
- if (!end || end < start)
+ if (!end || end < start || type + strlen(type) > end + 1)
return -EINVAL;
len = end - start;
if (len > 3)
return -EINVAL;
+ if (len == 0)
+ return 0; /* variable-length string */
+
strncpy(buf, start, len);
buf[len] = '\0';
@@ -184,12 +259,16 @@ static int synth_field_size(char *type)
size = sizeof(long);
else if (strcmp(type, "unsigned long") == 0)
size = sizeof(unsigned long);
+ else if (strcmp(type, "bool") == 0)
+ size = sizeof(bool);
else if (strcmp(type, "pid_t") == 0)
size = sizeof(pid_t);
else if (strcmp(type, "gfp_t") == 0)
size = sizeof(gfp_t);
else if (synth_field_is_string(type))
size = synth_field_string_size(type);
+ else if (synth_field_is_stack(type))
+ size = 0;
return size;
}
@@ -226,11 +305,15 @@ static const char *synth_field_fmt(char *type)
fmt = "%ld";
else if (strcmp(type, "unsigned long") == 0)
fmt = "%lu";
+ else if (strcmp(type, "bool") == 0)
+ fmt = "%d";
else if (strcmp(type, "pid_t") == 0)
fmt = "%d";
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
+ fmt = "%.*s";
+ else if (synth_field_is_stack(type))
fmt = "%s";
return fmt;
@@ -238,23 +321,23 @@ static const char *synth_field_fmt(char *type)
static void print_synth_event_num_val(struct trace_seq *s,
char *print_fmt, char *name,
- int size, u64 val, char *space)
+ int size, union trace_synth_field *val, char *space)
{
switch (size) {
case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u8, space);
break;
case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u16, space);
break;
case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u32, space);
break;
default:
- trace_seq_printf(s, print_fmt, name, val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u64, space);
break;
}
}
@@ -267,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct synth_trace_event *entry;
struct synth_event *se;
- unsigned int i, n_u64;
+ unsigned int i, j, n_u64;
char print_fmt[32];
const char *fmt;
@@ -290,10 +373,29 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
/* parameter values */
if (se->fields[i]->is_string) {
- trace_seq_printf(s, print_fmt, se->fields[i]->name,
- (char *)&entry->fields[n_u64],
- i == se->n_fields - 1 ? "" : " ");
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ if (se->fields[i]->is_dynamic) {
+ union trace_synth_field *data = &entry->fields[n_u64];
+
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ STR_VAR_LEN_MAX,
+ (char *)entry + data->as_dynamic.offset,
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64++;
+ } else {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ STR_VAR_LEN_MAX,
+ (char *)&entry->fields[n_u64].as_u64,
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ }
+ } else if (se->fields[i]->is_stack) {
+ union trace_synth_field *data = &entry->fields[n_u64];
+ unsigned long *p = (void *)entry + data->as_dynamic.offset;
+
+ trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
+ for (j = 1; j < data->as_dynamic.len / sizeof(long); j++)
+ trace_seq_printf(s, "=> %pS\n", (void *)p[j]);
+ n_u64++;
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -302,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
print_synth_event_num_val(s, print_fmt,
se->fields[i]->name,
se->fields[i]->size,
- entry->fields[n_u64],
+ &entry->fields[n_u64],
space);
if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
+ entry->fields[n_u64].as_u64,
__flags);
trace_seq_putc(s, ')');
}
@@ -325,16 +427,91 @@ static struct trace_event_functions synth_event_funcs = {
.trace = print_synth_event
};
+static unsigned int trace_string(struct synth_trace_event *entry,
+ struct synth_event *event,
+ char *str_val,
+ bool is_dynamic,
+ unsigned int data_size,
+ unsigned int *n_u64)
+{
+ unsigned int len = 0;
+ char *str_field;
+ int ret;
+
+ if (is_dynamic) {
+ union trace_synth_field *data = &entry->fields[*n_u64];
+
+ len = fetch_store_strlen((unsigned long)str_val);
+ data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
+ data->as_dynamic.len = len;
+
+ ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+
+ (*n_u64)++;
+ } else {
+ str_field = (char *)&entry->fields[*n_u64].as_u64;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)str_val < TASK_SIZE)
+ ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX);
+ else
+#endif
+ ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+
+ if (ret < 0)
+ strcpy(str_field, FAULT_STRING);
+
+ (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
+ }
+
+ return len;
+}
+
+static unsigned int trace_stack(struct synth_trace_event *entry,
+ struct synth_event *event,
+ long *stack,
+ unsigned int data_size,
+ unsigned int *n_u64)
+{
+ union trace_synth_field *data = &entry->fields[*n_u64];
+ unsigned int len;
+ u32 data_offset;
+ void *data_loc;
+
+ data_offset = struct_size(entry, fields, event->n_u64);
+ data_offset += data_size;
+
+ for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) {
+ if (!stack[len])
+ break;
+ }
+
+ len *= sizeof(long);
+
+ /* Find the dynamic section to copy the stack into. */
+ data_loc = (void *)entry + data_offset;
+ memcpy(data_loc, stack, len);
+
+ /* Fill in the field that holds the offset/len combo */
+
+ data->as_dynamic.offset = data_offset;
+ data->as_dynamic.len = len;
+
+ (*n_u64)++;
+
+ return len;
+}
+
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
unsigned int *var_ref_idx)
{
+ unsigned int i, n_u64, val_idx, len, data_size = 0;
struct trace_event_file *trace_file = __data;
struct synth_trace_event *entry;
struct trace_event_buffer fbuffer;
struct trace_buffer *buffer;
struct synth_event *event;
- unsigned int i, n_u64, val_idx;
int fields_size = 0;
event = trace_file->event_call->data;
@@ -344,6 +521,24 @@ static notrace void trace_event_raw_event_synth(void *__data,
fields_size = event->n_u64 * sizeof(u64);
+ for (i = 0; i < event->n_dynamic_fields; i++) {
+ unsigned int field_pos = event->dynamic_fields[i]->field_pos;
+ char *str_val;
+
+ val_idx = var_ref_idx[field_pos];
+ str_val = (char *)(long)var_ref_vals[val_idx];
+
+ if (event->dynamic_fields[i]->is_stack) {
+ /* reserve one extra element for size */
+ len = *((unsigned long *)str_val) + 1;
+ len *= sizeof(unsigned long);
+ } else {
+ len = fetch_store_strlen((unsigned long)str_val);
+ }
+
+ fields_size += len;
+ }
+
/*
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
@@ -360,29 +555,36 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[i];
if (event->fields[i]->is_string) {
char *str_val = (char *)(long)var_ref_vals[val_idx];
- char *str_field = (char *)&entry->fields[n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ len = trace_string(entry, event, str_val,
+ event->fields[i]->is_dynamic,
+ data_size, &n_u64);
+ data_size += len; /* only dynamic string increments */
+ } else if (event->fields[i]->is_stack) {
+ long *stack = (long *)(long)var_ref_vals[val_idx];
+
+ len = trace_stack(entry, event, stack,
+ data_size, &n_u64);
+ data_size += len;
} else {
struct synth_field *field = event->fields[i];
u64 val = var_ref_vals[val_idx];
switch (field->size) {
case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
+ entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
+ entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
+ entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- entry->fields[n_u64] = val;
+ entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -422,8 +624,16 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < event->n_fields; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", REC->%s", event->fields[i]->name);
+ if (event->fields[i]->is_string &&
+ event->fields[i]->is_dynamic)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_str(%s)", event->fields[i]->name);
+ else if (event->fields[i]->is_stack)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_stacktrace(%s)", event->fields[i]->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", event->fields[i]->name);
}
#undef LEN_OR_ZERO
@@ -459,28 +669,53 @@ static void free_synth_field(struct synth_field *field)
kfree(field);
}
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
+static int check_field_version(const char *prefix, const char *field_type,
+ const char *field_name)
{
- struct synth_field *field;
- const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
- int len, ret = 0;
+ /*
+ * For backward compatibility, the old synthetic event command
+ * format did not require semicolons, and in order to not
+ * break user space, that old format must still work. If a new
+ * feature is added, then the format that uses the new feature
+ * will be required to have semicolons, as nothing that uses
+ * the old format would be using the new, yet to be created,
+ * feature. When a new feature is added, this will detect it,
+ * and return a number greater than 1, and require the format
+ * to use semicolons.
+ */
+ return 1;
+}
- if (field_type[0] == ';')
- field_type++;
+static struct synth_field *parse_synth_field(int argc, char **argv,
+ int *consumed, int *field_version)
+{
+ const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
+ struct synth_field *field;
+ int len, ret = -ENOMEM;
+ struct seq_buf s;
+ ssize_t size;
if (!strcmp(field_type, "unsigned")) {
- if (argc < 3)
+ if (argc < 3) {
+ synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type));
return ERR_PTR(-EINVAL);
+ }
prefix = "unsigned ";
field_type = argv[1];
field_name = argv[2];
- *consumed = 3;
+ *consumed += 3;
} else {
field_name = argv[1];
- *consumed = 2;
+ *consumed += 2;
+ }
+
+ if (!field_name) {
+ synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
+ return ERR_PTR(-EINVAL);
}
+ *field_version = check_field_version(prefix, field_type, field_name);
+
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
return ERR_PTR(-ENOMEM);
@@ -489,48 +724,85 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
array = strchr(field_name, '[');
if (array)
len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
+ if (!field->name)
+ goto free;
+
+ if (!is_good_name(field->name)) {
+ synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));
+ ret = -EINVAL;
goto free;
}
- if (field_type[0] == ';')
- field_type++;
len = strlen(field_type) + 1;
+
if (array)
len += strlen(array);
+
if (prefix)
len += strlen(prefix);
field->type = kzalloc(len, GFP_KERNEL);
- if (!field->type) {
- ret = -ENOMEM;
+ if (!field->type)
goto free;
- }
+
+ seq_buf_init(&s, field->type, len);
if (prefix)
- strcat(field->type, prefix);
- strcat(field->type, field_type);
- if (array) {
- strcat(field->type, array);
- if (field->type[len - 1] == ';')
- field->type[len - 1] = '\0';
- }
+ seq_buf_puts(&s, prefix);
+ seq_buf_puts(&s, field_type);
+ if (array)
+ seq_buf_puts(&s, array);
+ if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
+ goto free;
+
+ s.buffer[s.len] = '\0';
- field->size = synth_field_size(field->type);
- if (!field->size) {
+ size = synth_field_size(field->type);
+ if (size < 0) {
+ if (array)
+ synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name));
+ else
+ synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
ret = -EINVAL;
goto free;
+ } else if (size == 0) {
+ if (synth_field_is_string(field->type) ||
+ synth_field_is_stack(field->type)) {
+ char *type;
+
+ len = sizeof("__data_loc ") + strlen(field->type) + 1;
+ type = kzalloc(len, GFP_KERNEL);
+ if (!type)
+ goto free;
+
+ seq_buf_init(&s, type, len);
+ seq_buf_puts(&s, "__data_loc ");
+ seq_buf_puts(&s, field->type);
+
+ if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
+ goto free;
+ s.buffer[s.len] = '\0';
+
+ kfree(field->type);
+ field->type = type;
+
+ field->is_dynamic = true;
+ size = sizeof(u64);
+ } else {
+ synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
+ ret = -EINVAL;
+ goto free;
+ }
}
+ field->size = size;
if (synth_field_is_string(field->type))
field->is_string = true;
+ else if (synth_field_is_stack(field->type))
+ field->is_stack = true;
field->is_signed = synth_field_signed(field->type);
-
out:
return field;
free:
@@ -629,10 +901,9 @@ static int register_synth_event(struct synth_event *event)
}
ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
+ /* unregister_trace_event() will be called inside */
+ if (ret < 0)
trace_remove_event_call(call);
- goto err;
- }
out:
return ret;
err:
@@ -661,6 +932,7 @@ static void free_synth_event(struct synth_event *event)
free_synth_field(event->fields[i]);
kfree(event->fields);
+ kfree(event->dynamic_fields);
kfree(event->name);
kfree(event->class.system);
free_synth_tracepoint(event->tp);
@@ -671,8 +943,8 @@ static void free_synth_event(struct synth_event *event)
static struct synth_event *alloc_synth_event(const char *name, int n_fields,
struct synth_field **fields)
{
+ unsigned int i, j, n_dynamic_fields = 0;
struct synth_event *event;
- unsigned int i;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event) {
@@ -694,11 +966,31 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
goto out;
}
+ for (i = 0; i < n_fields; i++)
+ if (fields[i]->is_dynamic)
+ n_dynamic_fields++;
+
+ if (n_dynamic_fields) {
+ event->dynamic_fields = kcalloc(n_dynamic_fields,
+ sizeof(*event->dynamic_fields),
+ GFP_KERNEL);
+ if (!event->dynamic_fields) {
+ free_synth_event(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ }
+
dyn_event_init(&event->devent, &synth_event_ops);
- for (i = 0; i < n_fields; i++)
+ for (i = 0, j = 0; i < n_fields; i++) {
+ fields[i]->field_pos = i;
event->fields[i] = fields[i];
+ if (fields[i]->is_dynamic)
+ event->dynamic_fields[j++] = fields[i];
+ }
+ event->n_dynamic_fields = j;
event->n_fields = n_fields;
out:
return event;
@@ -710,6 +1002,10 @@ static int synth_event_check_arg_fn(void *data)
int size;
size = synth_field_size((char *)arg_pair->lhs);
+ if (size == 0) {
+ if (strstr((char *)arg_pair->lhs, "["))
+ return 0;
+ }
return size ? 0 : -EINVAL;
}
@@ -842,7 +1138,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields);
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
* @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the synth_event_gen_cmd_start() wrapper, which
@@ -914,6 +1210,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
* synth_event_gen_cmd_array_start - Start synthetic event command from an array
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
* @fields: An array of type/name field descriptions
* @n_fields: The number of field descriptions contained in the fields array
*
@@ -971,11 +1268,13 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
}
EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-static int __create_synth_event(int argc, const char *name, const char **argv)
+static int __create_synth_event(const char *name, const char *raw_fields)
{
+ char **argv, *field_str, *tmp_fields, *saved_fields = NULL;
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ int consumed, cmd_version = 1, n_fields_this_loop;
+ int i, argc, n_fields = 0, ret = 0;
struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
/*
* Argument syntax:
@@ -984,35 +1283,100 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
* where 'field' = type field_name
*/
- if (name[0] == '\0' || argc < 1)
+ if (name[0] == '\0') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
return -EINVAL;
+ }
+
+ if (!is_good_name(name)) {
+ synth_err(SYNTH_ERR_BAD_NAME, errpos(name));
+ return -EINVAL;
+ }
mutex_lock(&event_mutex);
event = find_synth_event(name);
if (event) {
+ synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name));
ret = -EEXIST;
- goto out;
+ goto err;
}
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
+ tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL);
+ if (!tmp_fields) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ while ((field_str = strsep(&tmp_fields, ";")) != NULL) {
+ argv = argv_split(GFP_KERNEL, field_str, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
goto err;
}
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
+ if (!argc) {
+ argv_free(argv);
+ continue;
+ }
+
+ n_fields_this_loop = 0;
+ consumed = 0;
+ while (argc > consumed) {
+ int field_version;
+
+ field = parse_synth_field(argc - consumed,
+ argv + consumed, &consumed,
+ &field_version);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto err_free_arg;
+ }
+
+ /*
+ * Track the highest version of any field we
+ * found in the command.
+ */
+ if (field_version > cmd_version)
+ cmd_version = field_version;
+
+ /*
+ * Now sort out what is and isn't valid for
+ * each supported version.
+ *
+ * If we see more than 1 field per loop, it
+ * means we have multiple fields between
+ * semicolons, and that's something we no
+ * longer support in a version 2 or greater
+ * command.
+ */
+ if (cmd_version > 1 && n_fields_this_loop >= 1) {
+ synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
+ ret = -EINVAL;
+ goto err_free_arg;
+ }
+
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+ ret = -EINVAL;
+ goto err_free_arg;
+ }
+ fields[n_fields++] = field;
+
+ n_fields_this_loop++;
+ }
+ argv_free(argv);
+
+ if (consumed < argc) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
goto err;
}
- fields[n_fields++] = field;
- i += consumed - 1;
+
}
- if (i < argc && strcmp(argv[i], ";") != 0) {
+ if (n_fields == 0) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
ret = -EINVAL;
goto err;
}
@@ -1025,13 +1389,17 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
}
ret = register_synth_event(event);
if (!ret)
- dyn_event_add(&event->devent);
+ dyn_event_add(&event->devent, &event->call);
else
free_synth_event(event);
out:
mutex_unlock(&event_mutex);
+ kfree(saved_fields);
+
return ret;
+ err_free_arg:
+ argv_free(argv);
err:
for (i = 0; i < n_fields; i++)
free_synth_field(fields[i]);
@@ -1041,7 +1409,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
/**
* synth_event_create - Create a new synthetic event
- * @name: The name of the new sythetic event
+ * @name: The name of the new synthetic event
* @fields: An array of type/name field descriptions
* @n_fields: The number of field descriptions contained in the fields array
* @mod: The module creating the event, NULL if not created from a module
@@ -1094,13 +1462,15 @@ static int destroy_synth_event(struct synth_event *se)
int ret;
if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
+ return -EBUSY;
+
+ if (trace_event_dyn_busy(&se->call))
+ return -EBUSY;
+
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
}
return ret;
@@ -1108,7 +1478,7 @@ static int destroy_synth_event(struct synth_event *se)
/**
* synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
+ * @event_name: The name of the new synthetic event
*
* Delete a synthetic event that was created with synth_event_create().
*
@@ -1129,7 +1499,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1141,26 +1510,85 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
}
EXPORT_SYMBOL_GPL(synth_event_delete);
-static int create_or_delete_synth_event(int argc, char **argv)
+static int check_command(const char *raw_command)
{
- const char *name = argv[0];
- int ret;
+ char **argv = NULL, *cmd, *saved_cmd, *name_and_field;
+ int argc, ret = 0;
+
+ cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ name_and_field = strsep(&cmd, ";");
+ if (!name_and_field) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (name_and_field[0] == '!')
+ goto free;
+
+ argv = argv_split(GFP_KERNEL, name_and_field, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ argv_free(argv);
+
+ if (argc < 3)
+ ret = -EINVAL;
+free:
+ kfree(saved_cmd);
+
+ return ret;
+}
+
+static int create_or_delete_synth_event(const char *raw_command)
+{
+ char *name = NULL, *fields, *p;
+ int ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+
+ ret = check_command(raw_command);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ p = strpbrk(raw_command, " \t");
+ if (!p && raw_command[0] != '!') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
- /* trace_run_command() ensures argc != 0 */
if (name[0] == '!') {
ret = synth_event_delete(name + 1);
- return ret;
+ goto free;
}
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
+ fields = skip_spaces(p);
+
+ ret = __create_synth_event(name, fields);
+free:
+ kfree(name);
+
+ return ret;
}
static int synth_event_run_command(struct dynevent_cmd *cmd)
@@ -1168,7 +1596,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd)
struct synth_event *se;
int ret;
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ ret = create_or_delete_synth_event(cmd->seq.buffer);
if (ret)
return ret;
@@ -1198,10 +1626,9 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
EXPORT_SYMBOL_GPL(synth_event_cmd_init);
static inline int
-__synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
+__synth_event_trace_init(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
{
- int entry_size, fields_size = 0;
int ret = 0;
memset(trace_state, '\0', sizeof(*trace_state));
@@ -1211,8 +1638,8 @@ __synth_event_trace_start(struct trace_event_file *file,
* ENABLED bit is set (which attaches the probe thus allowing
* this code to be called, etc). Because this is called
* directly by the user, we don't have that but we still need
- * to honor not logging when disabled. For the the iterated
- * trace case, we save the enabed state upon start and just
+ * to honor not logging when disabled. For the iterated
+ * trace case, we save the enabled state upon start and just
* ignore the following data calls.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
@@ -1223,8 +1650,20 @@ __synth_event_trace_start(struct trace_event_file *file,
}
trace_state->event = file->event_call->data;
+out:
+ return ret;
+}
+
+static inline int
+__synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state,
+ int dynamic_fields_size)
+{
+ int entry_size, fields_size = 0;
+ int ret = 0;
fields_size = trace_state->event->n_u64 * sizeof(u64);
+ fields_size += dynamic_fields_size;
/*
* Avoid ring buffer recursion detection, as this event
@@ -1241,7 +1680,7 @@ __synth_event_trace_start(struct trace_event_file *file,
ring_buffer_nest_end(trace_state->buffer);
ret = -EINVAL;
}
-out:
+
return ret;
}
@@ -1257,7 +1696,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
* synth_event_trace - Trace a synthetic event
* @file: The trace_event_file representing the synthetic event
* @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
+ * @...: Variable number of args containing the event values
*
* Trace a synthetic event using the values passed in the variable
* argument list.
@@ -1274,23 +1713,46 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
*/
int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
{
+ unsigned int i, n_u64, len, data_size = 0;
struct synth_event_trace_state state;
- unsigned int i, n_u64;
va_list args;
int ret;
- ret = __synth_event_trace_start(file, &state);
+ ret = __synth_event_trace_init(file, &state);
if (ret) {
if (ret == -ENOENT)
ret = 0; /* just disabled, not really an error */
return ret;
}
+ if (state.event->n_dynamic_fields) {
+ va_start(args, n_vals);
+
+ for (i = 0; i < state.event->n_fields; i++) {
+ u64 val = va_arg(args, u64);
+
+ if (state.event->fields[i]->is_string &&
+ state.event->fields[i]->is_dynamic) {
+ char *str_val = (char *)(long)val;
+
+ data_size += strlen(str_val) + 1;
+ }
+ }
+
+ va_end(args);
+ }
+
+ ret = __synth_event_trace_start(file, &state, data_size);
+ if (ret)
+ return ret;
+
if (n_vals != state.event->n_fields) {
ret = -EINVAL;
goto out;
}
+ data_size = 0;
+
va_start(args, n_vals);
for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
u64 val;
@@ -1299,28 +1761,29 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
if (state.event->fields[i]->is_string) {
char *str_val = (char *)(long)val;
- char *str_field = (char *)&state.entry->fields[n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ len = trace_string(state.entry, state.event, str_val,
+ state.event->fields[i]->is_dynamic,
+ data_size, &n_u64);
+ data_size += len; /* only dynamic string increments */
} else {
struct synth_field *field = state.event->fields[i];
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1355,48 +1818,65 @@ EXPORT_SYMBOL_GPL(synth_event_trace);
int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
unsigned int n_vals)
{
+ unsigned int i, n_u64, field_pos, len, data_size = 0;
struct synth_event_trace_state state;
- unsigned int i, n_u64;
+ char *str_val;
int ret;
- ret = __synth_event_trace_start(file, &state);
+ ret = __synth_event_trace_init(file, &state);
if (ret) {
if (ret == -ENOENT)
ret = 0; /* just disabled, not really an error */
return ret;
}
+ if (state.event->n_dynamic_fields) {
+ for (i = 0; i < state.event->n_dynamic_fields; i++) {
+ field_pos = state.event->dynamic_fields[i]->field_pos;
+ str_val = (char *)(long)vals[field_pos];
+ len = strlen(str_val) + 1;
+ data_size += len;
+ }
+ }
+
+ ret = __synth_event_trace_start(file, &state, data_size);
+ if (ret)
+ return ret;
+
if (n_vals != state.event->n_fields) {
ret = -EINVAL;
goto out;
}
+ data_size = 0;
+
for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
if (state.event->fields[i]->is_string) {
char *str_val = (char *)(long)vals[i];
- char *str_field = (char *)&state.entry->fields[n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ len = trace_string(state.entry, state.event, str_val,
+ state.event->fields[i]->is_dynamic,
+ data_size, &n_u64);
+ data_size += len; /* only dynamic string increments */
} else {
struct synth_field *field = state.event->fields[i];
u64 val = vals[i];
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1445,9 +1925,17 @@ int synth_event_trace_start(struct trace_event_file *file,
if (!trace_state)
return -EINVAL;
- ret = __synth_event_trace_start(file, trace_state);
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
+ ret = __synth_event_trace_init(file, trace_state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ if (trace_state->event->n_dynamic_fields)
+ return -ENOTSUPP;
+
+ ret = __synth_event_trace_start(file, trace_state, 0);
return ret;
}
@@ -1508,6 +1996,11 @@ static int __synth_event_add_val(const char *field_name, u64 val,
char *str_val = (char *)(long)val;
char *str_field;
+ if (field->is_dynamic) { /* add_val can't do dynamic strings */
+ ret = -EINVAL;
+ goto out;
+ }
+
if (!str_val) {
ret = -EINVAL;
goto out;
@@ -1518,19 +2011,19 @@ static int __synth_event_add_val(const char *field_name, u64 val,
} else {
switch (field->size) {
case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ trace_state->entry->fields[field->offset].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ trace_state->entry->fields[field->offset].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ trace_state->entry->fields[field->offset].as_u32 = (u32)val;
break;
default:
- trace_state->entry->fields[field->offset] = val;
+ trace_state->entry->fields[field->offset].as_u64 = val;
break;
}
}
@@ -1576,7 +2069,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val);
/**
* synth_event_add_val - Add a named field's value to an open synth trace
* @field_name: The name of the synthetic event field value to set
- * @val: The value to set the next field to
+ * @val: The value to set the named field to
* @trace_state: A pointer to object tracking the piecewise trace state
*
* Set the value of the named field in an event that's been opened by
@@ -1639,23 +2132,60 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state)
}
EXPORT_SYMBOL_GPL(synth_event_trace_end);
-static int create_synth_event(int argc, const char **argv)
+static int create_synth_event(const char *raw_command)
{
- const char *name = argv[0];
- int len;
+ char *fields, *p;
+ const char *name;
+ int len, ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+ name = raw_command;
+
+ /* Don't try to process if not our system */
if (name[0] != 's' || name[1] != ':')
return -ECANCELED;
name += 2;
+ p = strpbrk(raw_command, " \t");
+ if (!p) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return -EINVAL;
+ }
+
+ fields = skip_spaces(p);
+
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
+ if (len == 0) {
+ synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0);
return -EINVAL;
+ }
name += len;
}
- return __create_synth_event(argc - 1, name, argv + 1);
+
+ len = name - raw_command;
+
+ ret = check_command(raw_command + len);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ ret = __create_synth_event(name, fields);
+
+ kfree(name);
+
+ return ret;
}
static int synth_event_release(struct dyn_event *ev)
@@ -1666,6 +2196,9 @@ static int synth_event_release(struct dyn_event *ev)
if (event->ref)
return -EBUSY;
+ if (trace_event_dyn_busy(&event->call))
+ return -EBUSY;
+
ret = unregister_synth_event(event);
if (ret)
return ret;
@@ -1679,14 +2212,22 @@ static int __synth_event_show(struct seq_file *m, struct synth_event *event)
{
struct synth_field *field;
unsigned int i;
+ char *type, *t;
seq_printf(m, "%s\t", event->name);
for (i = 0; i < event->n_fields; i++) {
field = event->fields[i];
+ type = field->type;
+ t = strstr(type, "__data_loc");
+ if (t) { /* __data_loc belongs in format but not event desc */
+ t += sizeof("__data_loc");
+ type = t;
+ }
+
/* parameter values */
- seq_printf(m, "%s %s%s", field->type, field->name,
+ seq_printf(m, "%s %s%s", type, field->name,
i == event->n_fields - 1 ? "" : "; ");
}
@@ -1754,26 +2295,32 @@ static const struct file_operations synth_events_fops = {
.release = seq_release,
};
-static __init int trace_events_synth_init(void)
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup kprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int trace_events_synth_init_early(void)
{
- struct dentry *entry = NULL;
- struct dentry *d_tracer;
int err = 0;
err = dyn_event_register(&synth_event_ops);
- if (err) {
+ if (err)
pr_warn("Could not register synth_event_ops\n");
- return err;
- }
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer)) {
- err = PTR_ERR(d_tracer);
+ return err;
+}
+core_initcall(trace_events_synth_init_early);
+
+static __init int trace_events_synth_init(void)
+{
+ struct dentry *entry = NULL;
+ int err = 0;
+ err = tracing_init_dentry();
+ if (err)
goto err;
- }
- entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
- NULL, &synth_events_fops);
+ entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE,
+ NULL, NULL, &synth_events_fops);
if (!entry) {
err = -ENODEV;
goto err;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f725802160c0..b33c3861fbbb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data)
/**
* event_triggers_call - Call triggers associated with a trace event
* @file: The trace_event_file associated with the event
+ * @buffer: The ring buffer that the event is being written to
* @rec: The trace entry for the event, NULL for unconditional invocation
+ * @event: The event meta data in the ring buffer
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command. If rec is
@@ -53,7 +55,8 @@ void trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct event_trigger_data *data;
@@ -67,7 +70,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
if (data->paused)
continue;
if (!rec) {
- data->ops->func(data, rec, event);
+ data->ops->trigger(data, buffer, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -77,12 +80,26 @@ event_triggers_call(struct trace_event_file *file, void *rec,
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data, rec, event);
+ data->ops->trigger(data, buffer, rec, event);
}
return tt;
}
EXPORT_SYMBOL_GPL(event_triggers_call);
+bool __trace_trigger_soft_disabled(struct trace_event_file *file)
+{
+ unsigned long eflags = file->flags;
+
+ if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
+ event_triggers_call(file, NULL, NULL, NULL);
+ if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
+ return true;
+ if (eflags & EVENT_FILE_FL_PID_FILTER)
+ return trace_event_ignore_this_pid(file);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled);
+
/**
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
@@ -105,7 +122,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, NULL, NULL);
+ data->ops->trigger(data, NULL, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -123,6 +140,19 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
return seq_list_next(t, &event_file->triggers, pos);
}
+static bool check_user_trigger(struct trace_event_file *file)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry_rcu(data, &file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
+ if (data->flags & EVENT_TRIGGER_FL_PROBE)
+ continue;
+ return true;
+ }
+ return false;
+}
+
static void *trigger_start(struct seq_file *m, loff_t *pos)
{
struct trace_event_file *event_file;
@@ -133,7 +163,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
- if (list_empty(&event_file->triggers))
+ if (list_empty(&event_file->triggers) || !check_user_trigger(event_file))
return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
return seq_list_start(&event_file->triggers, *pos);
@@ -161,7 +191,7 @@ static int trigger_show(struct seq_file *m, void *v)
}
data = list_entry(v, struct event_trigger_data, list);
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
return 0;
}
@@ -232,7 +262,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
mutex_lock(&trigger_cmd_mutex);
list_for_each_entry(p, &trigger_commands, list) {
if (strcmp(p->name, command) == 0) {
- ret = p->func(p, file, buff, command, next);
+ ret = p->parse(p, file, buff, command, next);
goto out_unlock;
}
}
@@ -405,7 +435,6 @@ event_trigger_print(const char *name, struct seq_file *m,
/**
* event_trigger_init - Generic event_trigger_ops @init implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger initialization.
@@ -415,8 +444,7 @@ event_trigger_print(const char *name, struct seq_file *m,
*
* Return: 0 on success, errno otherwise
*/
-int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+int event_trigger_init(struct event_trigger_data *data)
{
data->ref++;
return 0;
@@ -424,7 +452,6 @@ int event_trigger_init(struct event_trigger_ops *ops,
/**
* event_trigger_free - Generic event_trigger_ops @free implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger de-initialization.
@@ -433,8 +460,7 @@ int event_trigger_init(struct event_trigger_ops *ops,
* implementations.
*/
static void
-event_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+event_trigger_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
@@ -488,7 +514,7 @@ clear_event_triggers(struct trace_array *tr)
trace_event_trigger_enable_disable(file, 0);
list_del_rcu(&data->list);
if (data->ops->free)
- data->ops->free(data->ops, data);
+ data->ops->free(data);
}
}
}
@@ -527,7 +553,6 @@ void update_cond_flag(struct trace_event_file *file)
/**
* register_trigger - Generic event_command @reg implementation
* @glob: The raw string used to register the trigger
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data to associate with the trigger
* @file: The trace_event_file associated with the event
*
@@ -538,7 +563,7 @@ void update_cond_flag(struct trace_event_file *file)
*
* Return: 0 on success, errno otherwise
*/
-static int register_trigger(char *glob, struct event_trigger_ops *ops,
+static int register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -555,19 +580,18 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
@@ -576,7 +600,6 @@ out:
/**
* unregister_trigger - Generic event_command @unreg implementation
* @glob: The raw string used to register the trigger
- * @ops: The trigger ops associated with the trigger
* @test: Trigger-specific data used to find the trigger to remove
* @file: The trace_event_file associated with the event
*
@@ -585,18 +608,17 @@ out:
* Usually used directly as the @unreg method in event command
* implementations.
*/
-static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+static void unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file)
{
- struct event_trigger_data *data;
- bool unregistered = false;
+ struct event_trigger_data *data = NULL, *iter;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
- unregistered = true;
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -604,117 +626,407 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
+/*
+ * Event trigger parsing helper functions.
+ *
+ * These functions help make it easier to write an event trigger
+ * parsing function i.e. the struct event_command.parse() callback
+ * function responsible for parsing and registering a trigger command
+ * written to the 'trigger' file.
+ *
+ * A trigger command (or just 'trigger' for short) takes the form:
+ * [trigger] [if filter]
+ *
+ * The struct event_command.parse() callback (and other struct
+ * event_command functions) refer to several components of a trigger
+ * command. Those same components are referenced by the event trigger
+ * parsing helper functions defined below. These components are:
+ *
+ * cmd - the trigger command name
+ * glob - the trigger command name optionally prefaced with '!'
+ * param_and_filter - text following cmd and ':'
+ * param - text following cmd and ':' and stripped of filter
+ * filter - the optional filter text following (and including) 'if'
+ *
+ * To illustrate the use of these componenents, here are some concrete
+ * examples. For the following triggers:
+ *
+ * echo 'traceon:5 if pid == 0' > trigger
+ * - 'traceon' is both cmd and glob
+ * - '5 if pid == 0' is the param_and_filter
+ * - '5' is the param
+ * - 'if pid == 0' is the filter
+ *
+ * echo 'enable_event:sys:event:n' > trigger
+ * - 'enable_event' is both cmd and glob
+ * - 'sys:event:n' is the param_and_filter
+ * - 'sys:event:n' is the param
+ * - there is no filter
+ *
+ * echo 'hist:keys=pid if prio > 50' > trigger
+ * - 'hist' is both cmd and glob
+ * - 'keys=pid if prio > 50' is the param_and_filter
+ * - 'keys=pid' is the param
+ * - 'if prio > 50' is the filter
+ *
+ * echo '!enable_event:sys:event:n' > trigger
+ * - 'enable_event' the cmd
+ * - '!enable_event' is the glob
+ * - 'sys:event:n' is the param_and_filter
+ * - 'sys:event:n' is the param
+ * - there is no filter
+ *
+ * echo 'traceoff' > trigger
+ * - 'traceoff' is both cmd and glob
+ * - there is no param_and_filter
+ * - there is no param
+ * - there is no filter
+ *
+ * There are a few different categories of event trigger covered by
+ * these helpers:
+ *
+ * - triggers that don't require a parameter e.g. traceon
+ * - triggers that do require a parameter e.g. enable_event and hist
+ * - triggers that though they may not require a param may support an
+ * optional 'n' param (n = number of times the trigger should fire)
+ * e.g.: traceon:5 or enable_event:sys:event:n
+ * - triggers that do not support an 'n' param e.g. hist
+ *
+ * These functions can be used or ignored as necessary - it all
+ * depends on the complexity of the trigger, and the granularity of
+ * the functions supported reflects the fact that some implementations
+ * may need to customize certain aspects of their implementations and
+ * won't need certain functions. For instance, the hist trigger
+ * implementation doesn't use event_trigger_separate_filter() because
+ * it has special requirements for handling the filter.
+ */
+
/**
- * event_trigger_callback - Generic event_command @func implementation
- * @cmd_ops: The command ops, used for trigger registration
- * @file: The trace_event_file associated with the event
- * @glob: The raw string used to register the trigger
- * @cmd: The cmd portion of the string used to register the trigger
- * @param: The params portion of the string used to register the trigger
+ * event_trigger_check_remove - check whether an event trigger specifies remove
+ * @glob: The trigger command string, with optional remove(!) operator
*
- * Common implementation for event command parsing and trigger
- * instantiation.
+ * The event trigger callback implementations pass in 'glob' as a
+ * parameter. This is the command name either with or without a
+ * remove(!) operator. This function simply parses the glob and
+ * determines whether the command corresponds to a trigger removal or
+ * a trigger addition.
*
- * Usually used directly as the @func method in event command
- * implementations.
+ * Return: true if this is a remove command, false otherwise
+ */
+bool event_trigger_check_remove(const char *glob)
+{
+ return (glob && glob[0] == '!') ? true : false;
+}
+
+/**
+ * event_trigger_empty_param - check whether the param is empty
+ * @param: The trigger param string
+ *
+ * The event trigger callback implementations pass in 'param' as a
+ * parameter. This corresponds to the string following the command
+ * name minus the command name. This function can be called by a
+ * callback implementation for any command that requires a param; a
+ * callback that doesn't require a param can ignore it.
+ *
+ * Return: true if this is an empty param, false otherwise
+ */
+bool event_trigger_empty_param(const char *param)
+{
+ return !param;
+}
+
+/**
+ * event_trigger_separate_filter - separate an event trigger from a filter
+ * @param_and_filter: String containing trigger and possibly filter
+ * @param: outparam, will be filled with a pointer to the trigger
+ * @filter: outparam, will be filled with a pointer to the filter
+ * @param_required: Specifies whether or not the param string is required
+ *
+ * Given a param string of the form '[trigger] [if filter]', this
+ * function separates the filter from the trigger and returns the
+ * trigger in @param and the filter in @filter. Either the @param
+ * or the @filter may be set to NULL by this function - if not set to
+ * NULL, they will contain strings corresponding to the trigger and
+ * filter.
+ *
+ * There are two cases that need to be handled with respect to the
+ * passed-in param: either the param is required, or it is not
+ * required. If @param_required is set, and there's no param, it will
+ * return -EINVAL. If @param_required is not set and there's a param
+ * that starts with a number, that corresponds to the case of a
+ * trigger with :n (n = number of times the trigger should fire) and
+ * the parsing continues normally; otherwise the function just returns
+ * and assumes param just contains a filter and there's nothing else
+ * to do.
*
* Return: 0 on success, errno otherwise
*/
-static int
-event_trigger_callback(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+int event_trigger_separate_filter(char *param_and_filter, char **param,
+ char **filter, bool param_required)
{
- struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
- char *trigger = NULL;
- char *number;
- int ret;
+ int ret = 0;
- /* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0])) {
- trigger = strsep(&param, " \t");
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
+ *param = *filter = NULL;
+
+ if (!param_and_filter) {
+ if (param_required)
+ ret = -EINVAL;
+ goto out;
}
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+ /*
+ * Here we check for an optional param. The only legal
+ * optional param is :n, and if that's the case, continue
+ * below. Otherwise we assume what's left is a filter and
+ * return it as the filter string for the caller to deal with.
+ */
+ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) {
+ *filter = param_and_filter;
+ goto out;
+ }
+
+ /*
+ * Separate the param from the filter (param [if filter]).
+ * Here we have either an optional :n param or a required
+ * param and an optional filter.
+ */
+ *param = strsep(&param_and_filter, " \t");
+
+ /*
+ * Here we have a filter, though it may be empty.
+ */
+ if (param_and_filter) {
+ *filter = skip_spaces(param_and_filter);
+ if (!**filter)
+ *filter = NULL;
+ }
+out:
+ return ret;
+}
+
+/**
+ * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @cmd: The cmd string
+ * @param: The param string
+ * @private_data: User data to associate with the event trigger
+ *
+ * Allocate an event_trigger_data instance and initialize it. The
+ * @cmd_ops are used along with the @cmd and @param to get the
+ * trigger_ops to assign to the event_trigger_data. @private_data can
+ * also be passed in and associated with the event_trigger_data.
+ *
+ * Use event_trigger_free() to free an event_trigger_data object.
+ *
+ * Return: The trigger_data object success, NULL otherwise
+ */
+struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
+{
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
- ret = -ENOMEM;
trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
if (!trigger_data)
- goto out;
+ return NULL;
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
- trigger_data->private_data = file;
+ trigger_data->private_data = private_data;
+
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
+ RCU_INIT_POINTER(trigger_data->filter, NULL);
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
- kfree(trigger_data);
- ret = 0;
- goto out;
- }
+ return trigger_data;
+}
+
+/**
+ * event_trigger_parse_num - parse and return the number param for a trigger
+ * @param: The param string
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Parse the :n (n = number of times the trigger should fire) param
+ * and set the count variable in the trigger_data to the parsed count.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_parse_num(char *param,
+ struct event_trigger_data *trigger_data)
+{
+ char *number;
+ int ret = 0;
- if (trigger) {
- number = strsep(&trigger, ":");
+ if (param) {
+ number = strsep(&param, ":");
- ret = -EINVAL;
if (!strlen(number))
- goto out_free;
+ return -EINVAL;
/*
* We use the callback data field (which is a pointer)
* as our counter.
*/
ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
}
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
+ return ret;
+}
+
+/**
+ * event_trigger_set_filter - set an event trigger's filter
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @param: The string containing the filter
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Set the filter for the trigger. If the filter is NULL, just return
+ * without error.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_set_filter(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *param,
+ struct event_trigger_data *trigger_data)
+{
+ if (param && cmd_ops->set_filter)
+ return cmd_ops->set_filter(param, trigger_data, file);
+
+ return 0;
+}
+
+/**
+ * event_trigger_reset_filter - reset an event trigger's filter
+ * @cmd_ops: The event_command operations for the trigger
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Reset the filter for the trigger to no filter.
+ */
+void event_trigger_reset_filter(struct event_command *cmd_ops,
+ struct event_trigger_data *trigger_data)
+{
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+}
+
+/**
+ * event_trigger_register - register an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Register an event trigger. The @cmd_ops are used to call the
+ * cmd_ops->reg() function which actually does the registration.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_register(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data)
+{
+ return cmd_ops->reg(glob, trigger_data, file);
+}
+
+/**
+ * event_trigger_unregister - unregister an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Unregister an event trigger. The @cmd_ops are used to call the
+ * cmd_ops->unreg() function which actually does the unregistration.
+ */
+void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data)
+{
+ cmd_ops->unreg(glob, trigger_data, file);
+}
+
+/*
+ * End event trigger parsing helper functions.
+ */
+
+/**
+ * event_trigger_parse - Generic event_command @parse implementation
+ * @cmd_ops: The command ops, used for trigger registration
+ * @file: The trace_event_file associated with the event
+ * @glob: The raw string used to register the trigger
+ * @cmd: The cmd portion of the string used to register the trigger
+ * @param_and_filter: The param and filter portion of the string used to register the trigger
+ *
+ * Common implementation for event command parsing and trigger
+ * instantiation.
+ *
+ * Usually used directly as the @parse method in event command
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param_and_filter)
+{
+ struct event_trigger_data *trigger_data;
+ char *param, *filter;
+ bool remove;
+ int ret;
+
+ remove = event_trigger_check_remove(glob);
+
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, false);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
+ if (!trigger_data)
+ goto out;
+
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
+ kfree(trigger_data);
+ ret = 0;
+ goto out;
+ }
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Up the trigger_data count to make sure reg doesn't free it on failure */
- event_trigger_init(trigger_ops, trigger_data);
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- cmd_ops->unreg(glob, trigger_ops, trigger_data, file);
- ret = -ENOENT;
- } else if (ret > 0)
- ret = 0;
+ event_trigger_init(trigger_data);
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
+ goto out_free;
/* Down the counter of trigger_data or free it if not used anymore */
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_free(trigger_data);
out:
return ret;
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
kfree(trigger_data);
goto out;
}
@@ -757,7 +1069,14 @@ int set_trigger_filter(char *filter_str,
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
- filter_str, false, &filter);
+ filter_str, true, &filter);
+
+ /* Only enabled set_str for error handling */
+ if (filter) {
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ }
+
/*
* If create_event_filter() fails, filter still needs to be freed.
* Which the calling code will do with data->filter.
@@ -768,8 +1087,14 @@ int set_trigger_filter(char *filter_str,
rcu_assign_pointer(data->filter, filter);
if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
+ /*
+ * Make sure the call is done with the filter.
+ * It is possible that a filter could fail at boot up,
+ * and then this path will be called. Avoid the synchronization
+ * in that case.
+ */
+ if (system_state != SYSTEM_BOOTING)
+ tracepoint_synchronize_unregister();
free_event_filter(tmp);
}
@@ -915,7 +1240,8 @@ void unpause_named_trigger(struct event_trigger_data *data)
/**
* set_named_trigger_data - Associate common named trigger data
- * @data: The trigger data of a named trigger to unpause
+ * @data: The trigger data to associate
+ * @named_data: The common named trigger to be associated
*
* Named triggers are sets of triggers that share a common set of
* trigger data. The first named trigger registered with a given name
@@ -937,9 +1263,20 @@ get_named_trigger_data(struct event_trigger_data *data)
}
static void
-traceon_trigger(struct event_trigger_data *data, void *rec,
+traceon_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_on(file->tr);
+ return;
+ }
+
if (tracing_is_on())
return;
@@ -947,11 +1284,19 @@ traceon_trigger(struct event_trigger_data *data, void *rec,
}
static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec,
+traceon_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -959,13 +1304,27 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- tracing_on();
+ if (file)
+ tracer_tracing_on(file->tr);
+ else
+ tracing_on();
}
static void
-traceoff_trigger(struct event_trigger_data *data, void *rec,
+traceoff_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_off(file->tr);
+ return;
+ }
+
if (!tracing_is_on())
return;
@@ -973,11 +1332,19 @@ traceoff_trigger(struct event_trigger_data *data, void *rec,
}
static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+traceoff_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (!tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (!tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -985,48 +1352,49 @@ traceoff_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- tracing_off();
+ if (file)
+ tracer_tracing_off(file->tr);
+ else
+ tracing_off();
}
static int
-traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceon", m, (void *)data->count,
data->filter_str);
}
static int
-traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceoff", m, (void *)data->count,
data->filter_str);
}
static struct event_trigger_ops traceon_trigger_ops = {
- .func = traceon_trigger,
+ .trigger = traceon_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceon_count_trigger_ops = {
- .func = traceon_count_trigger,
+ .trigger = traceon_count_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceoff_trigger_ops = {
- .func = traceoff_trigger,
+ .trigger = traceoff_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceoff_count_trigger_ops = {
- .func = traceoff_count_trigger,
+ .trigger = traceoff_count_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1051,7 +1419,7 @@ onoff_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_traceon_cmd = {
.name = "traceon",
.trigger_type = ETT_TRACE_ONOFF,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = onoff_get_trigger_ops,
@@ -1062,7 +1430,7 @@ static struct event_command trigger_traceoff_cmd = {
.name = "traceoff",
.trigger_type = ETT_TRACE_ONOFF,
.flags = EVENT_CMD_FL_POST_TRIGGER,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = onoff_get_trigger_ops,
@@ -1071,7 +1439,8 @@ static struct event_command trigger_traceoff_cmd = {
#ifdef CONFIG_TRACER_SNAPSHOT
static void
-snapshot_trigger(struct event_trigger_data *data, void *rec,
+snapshot_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
@@ -1083,7 +1452,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec,
}
static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+snapshot_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -1092,37 +1462,38 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- snapshot_trigger(data, rec, event);
+ snapshot_trigger(data, buffer, rec, event);
}
static int
-register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
+register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- if (tracing_alloc_snapshot_instance(file->tr) != 0)
- return 0;
+ int ret = tracing_alloc_snapshot_instance(file->tr);
- return register_trigger(glob, ops, data, file);
+ if (ret < 0)
+ return ret;
+
+ return register_trigger(glob, data, file);
}
static int
-snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("snapshot", m, (void *)data->count,
data->filter_str);
}
static struct event_trigger_ops snapshot_trigger_ops = {
- .func = snapshot_trigger,
+ .trigger = snapshot_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops snapshot_count_trigger_ops = {
- .func = snapshot_count_trigger,
+ .trigger = snapshot_count_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1137,7 +1508,7 @@ snapshot_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_snapshot_cmd = {
.name = "snapshot",
.trigger_type = ETT_SNAPSHOT,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_snapshot_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = snapshot_get_trigger_ops,
@@ -1176,14 +1547,21 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif
static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- trace_dump_stack(STACK_SKIP);
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ else
+ trace_dump_stack(STACK_SKIP);
}
static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -1192,26 +1570,25 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- stacktrace_trigger(data, rec, event);
+ stacktrace_trigger(data, buffer, rec, event);
}
static int
-stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("stacktrace", m, (void *)data->count,
data->filter_str);
}
static struct event_trigger_ops stacktrace_trigger_ops = {
- .func = stacktrace_trigger,
+ .trigger = stacktrace_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops stacktrace_count_trigger_ops = {
- .func = stacktrace_count_trigger,
+ .trigger = stacktrace_count_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1227,7 +1604,7 @@ static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
.flags = EVENT_CMD_FL_POST_TRIGGER,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = stacktrace_get_trigger_ops,
@@ -1254,7 +1631,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
}
static void
-event_enable_trigger(struct event_trigger_data *data, void *rec,
+event_enable_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1266,7 +1644,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec,
}
static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+event_enable_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1281,11 +1660,10 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, rec, event);
+ event_enable_trigger(data, buffer, rec, event);
}
int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1310,8 +1688,7 @@ int event_enable_trigger_print(struct seq_file *m,
return 0;
}
-void event_enable_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1322,75 +1699,69 @@ void event_enable_trigger_free(struct event_trigger_ops *ops,
if (!data->ref) {
/* Remove the SOFT_MODE flag */
trace_event_enable_disable(enable_data->file, 0, 1);
- module_put(enable_data->file->event_call->mod);
+ trace_event_put_ref(enable_data->file->event_call);
trigger_data_free(data);
kfree(enable_data);
}
}
static struct event_trigger_ops event_enable_trigger_ops = {
- .func = event_enable_trigger,
+ .trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_enable_count_trigger_ops = {
- .func = event_enable_count_trigger,
+ .trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_disable_trigger_ops = {
- .func = event_enable_trigger,
+ .trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_disable_count_trigger_ops = {
- .func = event_enable_count_trigger,
+ .trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-int event_enable_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+int event_enable_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param_and_filter)
{
struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
struct trace_array *tr = file->tr;
+ char *param, *filter;
+ bool enable, remove;
const char *system;
const char *event;
bool hist = false;
- char *trigger;
- char *number;
- bool enable;
int ret;
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- /* separate the trigger from the filter (s:e:n [if filter]) */
- trigger = strsep(&param, " \t");
- if (!trigger)
+ if (event_trigger_empty_param(param_and_filter))
return -EINVAL;
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
- system = strsep(&trigger, ":");
- if (!trigger)
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, true);
+ if (ret)
+ return ret;
+
+ system = strsep(&param, ":");
+ if (!param)
return -EINVAL;
- event = strsep(&trigger, ":");
+ event = strsep(&param, ":");
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
@@ -1406,32 +1777,24 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
#else
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
#endif
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
ret = -ENOMEM;
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
- if (!trigger_data)
- goto out;
enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
- if (!enable_data) {
- kfree(trigger_data);
+ if (!enable_data)
goto out;
- }
-
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data->private_data = enable_data;
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ if (!trigger_data) {
+ kfree(enable_data);
+ goto out;
+ }
+
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
kfree(trigger_data);
kfree(enable_data);
ret = 0;
@@ -1439,37 +1802,18 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
}
/* Up the trigger_data count to make sure nothing frees it on failure */
- event_trigger_init(trigger_ops, trigger_data);
+ event_trigger_init(trigger_data);
- if (trigger) {
- number = strsep(&trigger, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
- }
-
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
-
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(event_enable_file->event_call->mod);
+ ret = trace_event_try_get_ref(event_enable_file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -1478,37 +1822,27 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
ret = trace_event_enable_disable(event_enable_file, 1, 1);
if (ret < 0)
goto out_put;
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
goto out_disable;
- /* Just return zero, not the number of enabled functions */
- ret = 0;
- event_trigger_free(trigger_ops, trigger_data);
+
+ event_trigger_free(trigger_data);
out:
return ret;
-
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
- module_put(event_enable_file->event_call->mod);
+ trace_event_put_ref(event_enable_file->event_call);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
+ event_trigger_free(trigger_data);
kfree(enable_data);
+
goto out;
}
int event_enable_register_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -1531,43 +1865,40 @@ int event_enable_register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
}
void event_enable_unregister_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *test,
struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
+ struct event_trigger_data *data = NULL, *iter;
struct enable_trigger_data *enable_data;
- struct event_trigger_data *data;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- enable_data = data->private_data;
+ list_for_each_entry(iter, &file->triggers, list) {
+ enable_data = iter->private_data;
if (enable_data &&
- (data->cmd_ops->trigger_type ==
+ (iter->cmd_ops->trigger_type ==
test->cmd_ops->trigger_type) &&
(enable_data->file == test_enable_data->file)) {
- unregistered = true;
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -1575,8 +1906,8 @@ void event_enable_unregister_trigger(char *glob,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
static struct event_trigger_ops *
@@ -1604,7 +1935,7 @@ event_enable_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_enable_cmd = {
.name = ENABLE_EVENT_STR,
.trigger_type = ETT_EVENT_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.get_trigger_ops = event_enable_get_trigger_ops,
@@ -1614,7 +1945,7 @@ static struct event_command trigger_enable_cmd = {
static struct event_command trigger_disable_cmd = {
.name = DISABLE_EVENT_STR,
.trigger_type = ETT_EVENT_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.get_trigger_ops = event_enable_get_trigger_ops,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..e76f5e1efdf2
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,2784 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/refcount.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/user_events.h>
+#include "trace_dynevent.h"
+#include "trace_output.h"
+#include "trace.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+
+/*
+ * Internal bits (kernel side only) to keep track of connected probes:
+ * These are used when status is requested in text form about an event. These
+ * bits are compared against an internal byte on the event to determine which
+ * probes to print out to the user.
+ *
+ * These do not reflect the mapped bytes between the user and kernel space.
+ */
+#define EVENT_STATUS_FTRACE BIT(0)
+#define EVENT_STATUS_PERF BIT(1)
+#define EVENT_STATUS_OTHER BIT(7)
+
+/*
+ * Stores the system name, tables, and locks for a group of events. This
+ * allows isolation for events by various means.
+ */
+struct user_event_group {
+ char *system_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
+ DECLARE_HASHTABLE(register_table, 8);
+};
+
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
+
+/* Max allowed events for the whole system */
+static unsigned int max_user_events = 32768;
+
+/* Current number of events on the whole system */
+static unsigned int current_user_events;
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches one.
+ */
+struct user_event {
+ struct user_event_group *group;
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ struct work_struct put_work;
+ refcount_t refcnt;
+ int min_size;
+ int reg_flags;
+ char status;
+};
+
+/*
+ * Stores per-mm/event properties that enable an address to be
+ * updated properly for each task. As tasks are forked, we use
+ * these to track enablement sites that are tied to an event.
+ */
+struct user_event_enabler {
+ struct list_head mm_enablers_link;
+ struct user_event *event;
+ unsigned long addr;
+
+ /* Track enable bit, flags, etc. Aligned for bitops. */
+ unsigned long values;
+};
+
+/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
+#define ENABLE_VAL_BIT_MASK 0x3F
+
+/* Bit 6 is for faulting status of enablement */
+#define ENABLE_VAL_FAULTING_BIT 6
+
+/* Bit 7 is for freeing status of enablement */
+#define ENABLE_VAL_FREEING_BIT 7
+
+/* Bit 8 is for marking 32-bit on 64-bit */
+#define ENABLE_VAL_32_ON_64_BIT 8
+
+#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
+
+/* Only duplicate the bit and compat values */
+#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
+
+#define ENABLE_BITOPS(e) (&(e)->values)
+
+#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
+
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+ struct work_struct work;
+ struct user_event_mm *mm;
+ struct user_event_enabler *enabler;
+ int attempt;
+};
+
+static struct kmem_cache *fault_cache;
+
+/* Global list of memory descriptors using user_events */
+static LIST_HEAD(user_event_mms);
+static DEFINE_SPINLOCK(user_event_mms_lock);
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+struct user_event_file_info {
+ struct user_event_group *group;
+ struct user_event_refs *refs;
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head user_event_link;
+ int offset;
+ int flags;
+};
+
+static inline void align_addr_bit(unsigned long *addr, int *bit,
+ unsigned long *flags)
+{
+ if (IS_ALIGNED(*addr, sizeof(long))) {
+#ifdef __BIG_ENDIAN
+ /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
+ if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
+ *bit += 32;
+#endif
+ return;
+ }
+
+ *addr = ALIGN_DOWN(*addr, sizeof(long));
+
+ /*
+ * We only support 32 and 64 bit values. The only time we need
+ * to align is a 32 bit value on a 64 bit kernel, which on LE
+ * is always 32 bits, and on BE requires no change when unaligned.
+ */
+#ifdef __LITTLE_ENDIAN
+ *bit += 32;
+#endif
+}
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
+ struct user_event **newuser, int reg_flags);
+
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
+static void user_event_mm_put(struct user_event_mm *mm);
+static int destroy_user_event(struct user_event *user);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static bool user_event_capable(u16 reg_flags)
+{
+ /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
+ if (reg_flags & USER_EVENT_REG_PERSIST) {
+ if (!perfmon_capable())
+ return false;
+ }
+
+ return true;
+}
+
+static struct user_event *user_event_get(struct user_event *user)
+{
+ refcount_inc(&user->refcnt);
+
+ return user;
+}
+
+static void delayed_destroy_user_event(struct work_struct *work)
+{
+ struct user_event *user = container_of(
+ work, struct user_event, put_work);
+
+ mutex_lock(&event_mutex);
+
+ if (!refcount_dec_and_test(&user->refcnt))
+ goto out;
+
+ if (destroy_user_event(user)) {
+ /*
+ * The only reason this would fail here is if we cannot
+ * update the visibility of the event. In this case the
+ * event stays in the hashtable, waiting for someone to
+ * attempt to delete it later.
+ */
+ pr_warn("user_events: Unable to delete event\n");
+ refcount_set(&user->refcnt, 1);
+ }
+out:
+ mutex_unlock(&event_mutex);
+}
+
+static void user_event_put(struct user_event *user, bool locked)
+{
+ bool delete;
+
+ if (unlikely(!user))
+ return;
+
+ /*
+ * When the event is not enabled for auto-delete there will always
+ * be at least 1 reference to the event. During the event creation
+ * we initially set the refcnt to 2 to achieve this. In those cases
+ * the caller must acquire event_mutex and after decrement check if
+ * the refcnt is 1, meaning this is the last reference. When auto
+ * delete is enabled, there will only be 1 ref, IE: refcnt will be
+ * only set to 1 during creation to allow the below checks to go
+ * through upon the last put. The last put must always be done with
+ * the event mutex held.
+ */
+ if (!locked) {
+ lockdep_assert_not_held(&event_mutex);
+ delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
+ } else {
+ lockdep_assert_held(&event_mutex);
+ delete = refcount_dec_and_test(&user->refcnt);
+ }
+
+ if (!delete)
+ return;
+
+ /*
+ * We now have the event_mutex in all cases, which ensures that
+ * no new references will be taken until event_mutex is released.
+ * New references come through find_user_event(), which requires
+ * the event_mutex to be held.
+ */
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* We should not get here when persist flag is set */
+ pr_alert("BUG: Auto-delete engaged on persistent event\n");
+ goto out;
+ }
+
+ /*
+ * Unfortunately we have to attempt the actual destroy in a work
+ * queue. This is because not all cases handle a trace_event_call
+ * being removed within the class->reg() operation for unregister.
+ */
+ INIT_WORK(&user->put_work, delayed_destroy_user_event);
+
+ /*
+ * Since the event is still in the hashtable, we have to re-inc
+ * the ref count to 1. This count will be decremented and checked
+ * in the work queue to ensure it's still the last ref. This is
+ * needed because a user-process could register the same event in
+ * between the time of event_mutex release and the work queue
+ * running the delayed destroy. If we removed the item now from
+ * the hashtable, this would result in a timing window where a
+ * user process would fail a register because the trace_event_call
+ * register would fail in the tracing layers.
+ */
+ refcount_set(&user->refcnt, 1);
+
+ if (WARN_ON_ONCE(!schedule_work(&user->put_work))) {
+ /*
+ * If we fail we must wait for an admin to attempt delete or
+ * another register/close of the event, whichever is first.
+ */
+ pr_warn("user_events: Unable to queue delayed destroy\n");
+ }
+out:
+ /* Ensure if we didn't have event_mutex before we unlock it */
+ if (!locked)
+ mutex_unlock(&event_mutex);
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+ kfree(group->system_name);
+ kfree(group);
+}
+
+static char *user_event_group_system_name(void)
+{
+ char *system_name;
+ int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+ system_name = kmalloc(len, GFP_KERNEL);
+
+ if (!system_name)
+ return NULL;
+
+ snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+ return system_name;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+ return init_group;
+}
+
+static struct user_event_group *user_event_group_create(void)
+{
+ struct user_event_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+ if (!group)
+ return NULL;
+
+ group->system_name = user_event_group_system_name();
+
+ if (!group->system_name)
+ goto error;
+
+ mutex_init(&group->reg_mutex);
+ hash_init(group->register_table);
+
+ return group;
+error:
+ if (group)
+ user_event_group_destroy(group);
+
+ return NULL;
+};
+
+static void user_event_enabler_destroy(struct user_event_enabler *enabler,
+ bool locked)
+{
+ list_del_rcu(&enabler->mm_enablers_link);
+
+ /* No longer tracking the event via the enabler */
+ user_event_put(enabler->event, locked);
+
+ kfree(enabler);
+}
+
+static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr,
+ int attempt)
+{
+ bool unlocked;
+ int ret;
+
+ /*
+ * Normally this is low, ensure that it cannot be taken advantage of by
+ * bad user processes to cause excessive looping.
+ */
+ if (attempt > 10)
+ return -EFAULT;
+
+ mmap_read_lock(mm->mm);
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ &unlocked);
+out:
+ mmap_read_unlock(mm->mm);
+
+ return ret;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+ struct user_event_enabler_fault *fault = container_of(
+ work, struct user_event_enabler_fault, work);
+ struct user_event_enabler *enabler = fault->enabler;
+ struct user_event_mm *mm = fault->mm;
+ unsigned long uaddr = enabler->addr;
+ int attempt = fault->attempt;
+ int ret;
+
+ ret = user_event_mm_fault_in(mm, uaddr, attempt);
+
+ if (ret && ret != -ENOENT) {
+ struct user_event *user = enabler->event;
+
+ pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
+ }
+
+ /* Prevent state changes from racing */
+ mutex_lock(&event_mutex);
+
+ /* User asked for enabler to be removed during fault */
+ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
+ user_event_enabler_destroy(enabler, true);
+ goto out;
+ }
+
+ /*
+ * If we managed to get the page, re-issue the write. We do not
+ * want to get into a possible infinite loop, which is why we only
+ * attempt again directly if the page came in. If we couldn't get
+ * the page here, then we will try again the next time the event is
+ * enabled/disabled.
+ */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!ret) {
+ mmap_read_lock(mm->mm);
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ mmap_read_unlock(mm->mm);
+ }
+out:
+ mutex_unlock(&event_mutex);
+
+ /* In all cases we no longer need the mm or fault */
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ int attempt)
+{
+ struct user_event_enabler_fault *fault;
+
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+ if (!fault)
+ return false;
+
+ INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+ fault->mm = user_event_mm_get(mm);
+ fault->enabler = enabler;
+ fault->attempt = attempt;
+
+ /* Don't try to queue in again while we have a pending fault */
+ set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!schedule_work(&fault->work)) {
+ /* Allow another attempt later */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+
+ return false;
+ }
+
+ return true;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt)
+{
+ unsigned long uaddr = enabler->addr;
+ unsigned long *ptr;
+ struct page *page;
+ void *kaddr;
+ int bit = ENABLE_BIT(enabler);
+ int ret;
+
+ lockdep_assert_held(&event_mutex);
+ mmap_assert_locked(mm->mm);
+
+ *attempt += 1;
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0)
+ return -ENOENT;
+
+ if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) ||
+ test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
+ return -EBUSY;
+
+ align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
+
+ ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
+ &page, NULL);
+
+ if (unlikely(ret <= 0)) {
+ if (!fixup_fault)
+ return -EFAULT;
+
+ if (!user_event_enabler_queue_fault(mm, enabler, *attempt))
+ pr_warn("user_events: Unable to queue fault handler\n");
+
+ return -EFAULT;
+ }
+
+ kaddr = kmap_local_page(page);
+ ptr = kaddr + (uaddr & ~PAGE_MASK);
+
+ /* Update bit atomically, user tracers must be atomic as well */
+ if (enabler->event && enabler->event->status)
+ set_bit(bit, ptr);
+ else
+ clear_bit(bit, ptr);
+
+ kunmap_local(kaddr);
+ unpin_user_pages_dirty_lock(&page, 1, true);
+
+ return 0;
+}
+
+static bool user_event_enabler_exists(struct user_event_mm *mm,
+ unsigned long uaddr, unsigned char bit)
+{
+ struct user_event_enabler *enabler;
+
+ list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
+ if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit)
+ return true;
+ }
+
+ return false;
+}
+
+static void user_event_enabler_update(struct user_event *user)
+{
+ struct user_event_enabler *enabler;
+ struct user_event_mm *next;
+ struct user_event_mm *mm;
+ int attempt;
+
+ lockdep_assert_held(&event_mutex);
+
+ /*
+ * We need to build a one-shot list of all the mms that have an
+ * enabler for the user_event passed in. This list is only valid
+ * while holding the event_mutex. The only reason for this is due
+ * to the global mm list being RCU protected and we use methods
+ * which can wait (mmap_read_lock and pin_user_pages_remote).
+ *
+ * NOTE: user_event_mm_get_all() increments the ref count of each
+ * mm that is added to the list to prevent removal timing windows.
+ * We must always put each mm after they are used, which may wait.
+ */
+ mm = user_event_mm_get_all(user);
+
+ while (mm) {
+ next = mm->next;
+ mmap_read_lock(mm->mm);
+
+ list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
+ if (enabler->event == user) {
+ attempt = 0;
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ }
+ }
+
+ mmap_read_unlock(mm->mm);
+ user_event_mm_put(mm);
+ mm = next;
+ }
+}
+
+static bool user_event_enabler_dup(struct user_event_enabler *orig,
+ struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler;
+
+ /* Skip pending frees */
+ if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
+ return true;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT);
+
+ if (!enabler)
+ return false;
+
+ enabler->event = user_event_get(orig->event);
+ enabler->addr = orig->addr;
+
+ /* Only dup part of value (ignore future flags, etc) */
+ enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
+
+ /* Enablers not exposed yet, RCU not required */
+ list_add(&enabler->mm_enablers_link, &mm->enablers);
+
+ return true;
+}
+
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm)
+{
+ refcount_inc(&mm->refcnt);
+
+ return mm;
+}
+
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
+{
+ struct user_event_mm *found = NULL;
+ struct user_event_enabler *enabler;
+ struct user_event_mm *mm;
+
+ /*
+ * We use the mm->next field to build a one-shot list from the global
+ * RCU protected list. To build this list the event_mutex must be held.
+ * This lets us build a list without requiring allocs that could fail
+ * when user based events are most wanted for diagnostics.
+ */
+ lockdep_assert_held(&event_mutex);
+
+ /*
+ * We do not want to block fork/exec while enablements are being
+ * updated, so we use RCU to walk the current tasks that have used
+ * user_events ABI for 1 or more events. Each enabler found in each
+ * task that matches the event being updated has a write to reflect
+ * the kernel state back into the process. Waits/faults must not occur
+ * during this. So we scan the list under RCU for all the mm that have
+ * the event within it. This is needed because mm_read_lock() can wait.
+ * Each user mm returned has a ref inc to handle remove RCU races.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(mm, &user_event_mms, mms_link) {
+ list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) {
+ if (enabler->event == user) {
+ mm->next = found;
+ found = user_event_mm_get(mm);
+ break;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+
+ return found;
+}
+
+static struct user_event_mm *user_event_mm_alloc(struct task_struct *t)
+{
+ struct user_event_mm *user_mm;
+
+ user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
+
+ if (!user_mm)
+ return NULL;
+
+ user_mm->mm = t->mm;
+ INIT_LIST_HEAD(&user_mm->enablers);
+ refcount_set(&user_mm->refcnt, 1);
+ refcount_set(&user_mm->tasks, 1);
+
+ /*
+ * The lifetime of the memory descriptor can slightly outlast
+ * the task lifetime if a ref to the user_event_mm is taken
+ * between list_del_rcu() and call_rcu(). Therefore we need
+ * to take a reference to it to ensure it can live this long
+ * under this corner case. This can also occur in clones that
+ * outlast the parent.
+ */
+ mmgrab(user_mm->mm);
+
+ return user_mm;
+}
+
+static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_add_rcu(&user_mm->mms_link, &user_event_mms);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ t->user_event_mm = user_mm;
+}
+
+static struct user_event_mm *current_user_event_mm(void)
+{
+ struct user_event_mm *user_mm = current->user_event_mm;
+
+ if (user_mm)
+ goto inc;
+
+ user_mm = user_event_mm_alloc(current);
+
+ if (!user_mm)
+ goto error;
+
+ user_event_mm_attach(user_mm, current);
+inc:
+ refcount_inc(&user_mm->refcnt);
+error:
+ return user_mm;
+}
+
+static void user_event_mm_destroy(struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler, *next;
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link)
+ user_event_enabler_destroy(enabler, false);
+
+ mmdrop(mm->mm);
+ kfree(mm);
+}
+
+static void user_event_mm_put(struct user_event_mm *mm)
+{
+ if (mm && refcount_dec_and_test(&mm->refcnt))
+ user_event_mm_destroy(mm);
+}
+
+static void delayed_user_event_mm_put(struct work_struct *work)
+{
+ struct user_event_mm *mm;
+
+ mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork);
+ user_event_mm_put(mm);
+}
+
+void user_event_mm_remove(struct task_struct *t)
+{
+ struct user_event_mm *mm;
+ unsigned long flags;
+
+ might_sleep();
+
+ mm = t->user_event_mm;
+ t->user_event_mm = NULL;
+
+ /* Clone will increment the tasks, only remove if last clone */
+ if (!refcount_dec_and_test(&mm->tasks))
+ return;
+
+ /* Remove the mm from the list, so it can no longer be enabled */
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_del_rcu(&mm->mms_link);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ /*
+ * We need to wait for currently occurring writes to stop within
+ * the mm. This is required since exit_mm() snaps the current rss
+ * stats and clears them. On the final mmdrop(), check_mm() will
+ * report a bug if these increment.
+ *
+ * All writes/pins are done under mmap_read lock, take the write
+ * lock to ensure in-progress faults have completed. Faults that
+ * are pending but yet to run will check the task count and skip
+ * the fault since the mm is going away.
+ */
+ mmap_write_lock(mm->mm);
+ mmap_write_unlock(mm->mm);
+
+ /*
+ * Put for mm must be done after RCU delay to handle new refs in
+ * between the list_del_rcu() and now. This ensures any get refs
+ * during rcu_read_lock() are accounted for during list removal.
+ *
+ * CPU A | CPU B
+ * ---------------------------------------------------------------
+ * user_event_mm_remove() | rcu_read_lock();
+ * list_del_rcu() | list_for_each_entry_rcu();
+ * call_rcu() | refcount_inc();
+ * . | rcu_read_unlock();
+ * schedule_work() | .
+ * user_event_mm_put() | .
+ *
+ * mmdrop() cannot be called in the softirq context of call_rcu()
+ * so we use a work queue after call_rcu() to run within.
+ */
+ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
+ queue_rcu_work(system_wq, &mm->put_rwork);
+}
+
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
+{
+ struct user_event_mm *mm = user_event_mm_alloc(t);
+ struct user_event_enabler *enabler;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) {
+ if (!user_event_enabler_dup(enabler, mm))
+ goto error;
+ }
+
+ rcu_read_unlock();
+
+ user_event_mm_attach(mm, t);
+ return;
+error:
+ rcu_read_unlock();
+ user_event_mm_destroy(mm);
+}
+
+static bool current_user_event_enabler_exists(unsigned long uaddr,
+ unsigned char bit)
+{
+ struct user_event_mm *user_mm = current_user_event_mm();
+ bool exists;
+
+ if (!user_mm)
+ return false;
+
+ exists = user_event_enabler_exists(user_mm, uaddr, bit);
+
+ user_event_mm_put(user_mm);
+
+ return exists;
+}
+
+static struct user_event_enabler
+*user_event_enabler_create(struct user_reg *reg, struct user_event *user,
+ int *write_result)
+{
+ struct user_event_enabler *enabler;
+ struct user_event_mm *user_mm;
+ unsigned long uaddr = (unsigned long)reg->enable_addr;
+ int attempt = 0;
+
+ user_mm = current_user_event_mm();
+
+ if (!user_mm)
+ return NULL;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT);
+
+ if (!enabler)
+ goto out;
+
+ enabler->event = user;
+ enabler->addr = uaddr;
+ enabler->values = reg->enable_bit;
+
+#if BITS_PER_LONG >= 64
+ if (reg->enable_size == 4)
+ set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
+#endif
+
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Attempt to reflect the current state within the process */
+ mmap_read_lock(user_mm->mm);
+ *write_result = user_event_enabler_write(user_mm, enabler, false,
+ &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ /*
+ * If the write works, then we will track the enabler. A ref to the
+ * underlying user_event is held by the enabler to prevent it going
+ * away while the enabler is still in use by a process. The ref is
+ * removed when the enabler is destroyed. This means a event cannot
+ * be forcefully deleted from the system until all tasks using it
+ * exit or run exec(), which includes forks and clones.
+ */
+ if (!*write_result) {
+ user_event_get(user);
+ list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers);
+ }
+
+ mutex_unlock(&event_mutex);
+
+ if (*write_result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+
+ kfree(enabler);
+ enabler = NULL;
+ }
+out:
+ user_event_mm_put(user_mm);
+
+ return enabler;
+}
+
+static __always_inline __must_check
+bool user_event_last_ref(struct user_event *user)
+{
+ int last = 0;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST)
+ last = 1;
+
+ return refcount_read(&user->refcnt) == last;
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(struct user_event_group *group,
+ char *raw_command, struct user_event **newuser,
+ int reg_flags)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(group, name, args, flags, newuser, reg_flags);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, user_event_link) {
+ list_del(&validator->user_event_link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != NULL)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->user_event_link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != NULL)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != NULL;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+ char *buf, int len, bool *colon)
+{
+ int pos = 0, i = *iout;
+
+ *colon = false;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ *colon = true;
+ break;
+ }
+ }
+
+ /* Actual set, advance i */
+ if (len != 0)
+ *iout = i;
+
+ return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+ char *buf, int len, bool colon)
+{
+ int pos = 0;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size);
+
+ if (colon)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+ return pos + 1;
+}
+
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_reverse(field, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_reverse(field, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ return print_event_fields(iter, event);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ lockdep_assert_held(&event_mutex);
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ if (current_user_events > 0)
+ current_user_events--;
+ else
+ pr_alert("BUG: Bad current_user_events\n");
+
+ return ret;
+}
+
+static struct user_event *find_user_event(struct user_event_group *group,
+ char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(group->register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name))
+ return user_event_get(user);
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, user_event_link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Writes the user supplied payload out to perf ring buffer.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the enabled bit among all user processes.
+ */
+static void update_enable_bit_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ user->status = status;
+
+ user_event_enabler_update(user);
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ user_event_get(user);
+ update_enable_bit_for(user);
+ return 0;
+dec:
+ update_enable_bit_for(user);
+ user_event_put(user, true);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event_group *group;
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT);
+
+ if (!name)
+ return -ENOMEM;
+
+ group = current_user_event_group();
+
+ if (!group) {
+ kfree(name);
+ return -ENOENT;
+ }
+
+ mutex_lock(&group->reg_mutex);
+
+ /* Dyn events persist, otherwise they would cleanup immediately */
+ ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
+
+ if (!ret)
+ user_event_put(user, false);
+
+ mutex_unlock(&group->reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_reverse(field, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return !user_event_last_ref(user);
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (!user_event_last_ref(user))
+ return -EBUSY;
+
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name = NULL, *dyn_field_name = NULL;
+ bool colon = false, match = false;
+ int dyn_len, len;
+
+ if (*iout >= argc)
+ return false;
+
+ dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ 0, &colon);
+
+ len = user_field_set_string(field, field_name, 0, colon);
+
+ if (dyn_len != len)
+ return false;
+
+ dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+ field_name = kmalloc(len, GFP_KERNEL);
+
+ if (!dyn_field_name || !field_name)
+ goto out;
+
+ user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ dyn_len, &colon);
+
+ user_field_set_string(field, field_name, len, colon);
+
+ match = strcmp(dyn_field_name, field_name) == 0;
+out:
+ kfree(dyn_field_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_reverse(field, head, link) {
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+ }
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+ else if (match && argc == 0)
+ match = list_empty(&user->fields);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
+ struct user_event **newuser, int reg_flags)
+{
+ int ret;
+ u32 key;
+ struct user_event *user;
+ int argc = 0;
+ char **argv;
+
+ /* Currently don't support any text based flags */
+ if (flags != NULL)
+ return -EINVAL;
+
+ if (!user_event_capable(reg_flags))
+ return -EPERM;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(group, name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ if (args) {
+ argv = argv_split(GFP_KERNEL, args, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = user_fields_match(user, argc, (const char **)argv);
+ argv_free(argv);
+
+ } else
+ ret = list_empty(&user->fields);
+
+ if (ret) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ } else {
+ ret = -EADDRINUSE;
+ goto error;
+ }
+
+ return 0;
+error:
+ user_event_put(user, false);
+ return ret;
+ }
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->group = group;
+ user->tracepoint.name = name;
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+ user->class.system = group->system_name;
+
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+
+ if (current_user_events >= max_user_events) {
+ ret = -EMFILE;
+ goto put_user_lock;
+ }
+
+ ret = user_event_trace_register(user);
+
+ if (ret)
+ goto put_user_lock;
+
+ user->reg_flags = reg_flags;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
+ } else {
+ /* Ensure we track only caller ref (1) */
+ refcount_set(&user->refcnt, 1);
+ }
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ hash_add(group->register_table, &user->node, key);
+ current_user_events++;
+
+ mutex_unlock(&event_mutex);
+
+ *newuser = user;
+ return 0;
+put_user_lock:
+ mutex_unlock(&event_mutex);
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(struct user_event_group *group, char *name)
+{
+ u32 key;
+ struct user_event *user = find_user_event(group, name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ user_event_put(user, true);
+
+ if (!user_event_last_ref(user))
+ return -EBUSY;
+
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
+ return destroy_user_event(user);
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ if (idx < 0)
+ return -EINVAL;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(info->refs);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ } else
+ return -EBADF;
+
+ return ret;
+}
+
+static int user_events_open(struct inode *node, struct file *file)
+{
+ struct user_event_group *group;
+ struct user_event_file_info *info;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT);
+
+ if (!info)
+ return -ENOMEM;
+
+ info->group = group;
+
+ file->private_data = info;
+
+ return 0;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct user_event_file_info *info,
+ struct user_event *user)
+{
+ struct user_event_group *group = info->group;
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(info->refs,
+ lockdep_is_held(&group->reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user_event_get(user);
+
+ rcu_assign_pointer(info->refs, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (size < offsetofend(struct user_reg, write_index))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ if (ret)
+ return ret;
+
+ /* Ensure only valid flags */
+ if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
+ return -EINVAL;
+
+ /* Ensure supported size */
+ switch (kreg->enable_size) {
+ case 4:
+ /* 32-bit */
+ break;
+#if BITS_PER_LONG >= 64
+ case 8:
+ /* 64-bit */
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Ensure natural alignment */
+ if (kreg->enable_addr % kreg->enable_size)
+ return -EINVAL;
+
+ /* Ensure bit range for size */
+ if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1)
+ return -EINVAL;
+
+ /* Ensure accessible */
+ if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr,
+ kreg->enable_size))
+ return -EFAULT;
+
+ kreg->size = size;
+
+ return 0;
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+ unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ struct user_event_enabler *enabler;
+ char *name;
+ long ret;
+ int write_result;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Prevent users from using the same address and bit multiple times
+ * within the same mm address space. This can cause unexpected behavior
+ * for user processes that is far easier to debug if this is explictly
+ * an error upon registering.
+ */
+ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
+ reg.enable_bit))
+ return -EADDRINUSE;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(info->group, name, &user, reg.flags);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(info, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ user_event_put(user, false);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ /*
+ * user_events_ref_add succeeded:
+ * At this point we have a user_event, it's lifetime is bound by the
+ * reference count, not this file. If anything fails, the user_event
+ * still has a reference until the file is released. During release
+ * any remaining references (from user_events_ref_add) are decremented.
+ *
+ * Attempt to create an enabler, which too has a lifetime tied in the
+ * same way for the event. Once the task that caused the enabler to be
+ * created exits or issues exec() then the enablers it has created
+ * will be destroyed and the ref to the event will be decremented.
+ */
+ enabler = user_event_enabler_create(&reg, user, &write_result);
+
+ if (!enabler)
+ return -ENOMEM;
+
+ /* Write failed/faulted, give error back to caller */
+ if (write_result)
+ return write_result;
+
+ put_user((u32)ret, &ureg->write_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct user_event_file_info *info,
+ unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(info->group, name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+static long user_unreg_get(struct user_unreg __user *ureg,
+ struct user_unreg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (size < offsetofend(struct user_unreg, disable_addr))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ /* Ensure no reserved values, since we don't support any yet */
+ if (kreg->__reserved || kreg->__reserved2)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
+ unsigned long uaddr, unsigned char bit,
+ unsigned long flags)
+{
+ struct user_event_enabler enabler;
+ int result;
+ int attempt = 0;
+
+ memset(&enabler, 0, sizeof(enabler));
+ enabler.addr = uaddr;
+ enabler.values = bit | flags;
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Force the bit to be cleared, since no event is attached */
+ mmap_read_lock(user_mm->mm);
+ result = user_event_enabler_write(user_mm, &enabler, false, &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ mutex_unlock(&event_mutex);
+
+ if (result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+ }
+
+ return result;
+}
+
+/*
+ * Unregisters an enablement address/bit within a task/user mm.
+ */
+static long user_events_ioctl_unreg(unsigned long uarg)
+{
+ struct user_unreg __user *ureg = (struct user_unreg __user *)uarg;
+ struct user_event_mm *mm = current->user_event_mm;
+ struct user_event_enabler *enabler, *next;
+ struct user_unreg reg;
+ unsigned long flags;
+ long ret;
+
+ ret = user_unreg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ if (!mm)
+ return -ENOENT;
+
+ flags = 0;
+ ret = -ENOENT;
+
+ /*
+ * Flags freeing and faulting are used to indicate if the enabler is in
+ * use at all. When faulting is set a page-fault is occurring asyncly.
+ * During async fault if freeing is set, the enabler will be destroyed.
+ * If no async fault is happening, we can destroy it now since we hold
+ * the event_mutex during these checks.
+ */
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) {
+ if (enabler->addr == reg.disable_addr &&
+ ENABLE_BIT(enabler) == reg.disable_bit) {
+ set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
+
+ /* We must keep compat flags for the clear */
+ flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
+
+ if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
+ user_event_enabler_destroy(enabler, true);
+
+ /* Removed at least one */
+ ret = 0;
+ }
+ }
+
+ mutex_unlock(&event_mutex);
+
+ /* Ensure bit is now cleared for user, regardless of event status */
+ if (!ret)
+ ret = user_event_mm_clear_bit(mm, reg.disable_addr,
+ reg.disable_bit, flags);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group = info->group;
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_reg(info, uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_del(info, uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
+
+ case DIAG_IOCSUNREG:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_unreg(uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group;
+ struct user_event_refs *refs;
+ int i;
+
+ if (!info)
+ return -EINVAL;
+
+ group = info->group;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&group->reg_mutex);
+
+ refs = info->refs;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i)
+ user_event_put(refs->events[i], false);
+
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&group->reg_mutex);
+
+ kfree(refs);
+ kfree(info);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .open = user_events_open,
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event_group *group = m->private;
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0;
+
+ if (!group)
+ return -EINVAL;
+
+ mutex_lock(&group->reg_mutex);
+
+ hash_for_each(group->register_table, i, user, node) {
+ status = user->status;
+
+ seq_printf(m, "%s", EVENT_NAME(user));
+
+ if (status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&group->reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ struct user_event_group *group;
+ int ret;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ ret = seq_open(file, &user_seq_ops);
+
+ if (!ret) {
+ /* Chain group to seq_file */
+ struct seq_file *m = file->private_data;
+
+ m->private = group;
+ }
+
+ return ret;
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = proc_douintvec(table, write, buffer, lenp, ppos);
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static struct ctl_table user_event_sysctls[] = {
+ {
+ .procname = "user_events_max",
+ .data = &max_user_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = set_max_user_events_sysctl,
+ },
+ {}
+};
+
+static int __init trace_events_user_init(void)
+{
+ int ret;
+
+ fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+ if (!fault_cache)
+ return -ENOMEM;
+
+ init_group = user_event_group_create();
+
+ if (!init_group) {
+ kmem_cache_destroy(fault_cache);
+ return -ENOMEM;
+ }
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ user_event_group_destroy(init_group);
+ kmem_cache_destroy(fault_cache);
+ init_group = NULL;
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ register_sysctl_init("kernel", user_event_sysctls);
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 70d3d0a09053..1698fc22afa0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -26,7 +26,7 @@ static int ftrace_event_register(struct trace_event_call *call,
/*
* The FTRACE_ENTRY_REG macro allows ftrace entry to define register
- * function and thus become accesible via perf.
+ * function and thus become accessible via perf.
*/
#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
@@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __array
#define __array(type, item, size) type item[size];
+#undef __stack_array
+#define __stack_array(type, item, size, field) __array(type, item, size)
+
#undef __array_desc
#define __array_desc(type, container, item, size) type item[size];
@@ -111,7 +114,11 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
- is_signed_type(_type), .filter_type = FILTER_OTHER },
+ is_signed_type(_type), .filter_type = FILTER_OTHER, \
+ .len = _len },
+
+#undef __stack_array
+#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len)
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
@@ -148,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __array
#define __array(type, item, len)
+#undef __stack_array
+#define __stack_array(type, item, len, field)
+
#undef __array_desc
#define __array_desc(type, container, item, len)
@@ -176,7 +186,7 @@ struct trace_event_call __used event_##call = { \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
static struct trace_event_call __used \
-__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
+__section("_ftrace_events") *__event_##call = &event_##call;
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
new file mode 100644
index 000000000000..7d2ddbcfa377
--- /dev/null
+++ b/kernel/trace/trace_fprobe.c
@@ -0,0 +1,1231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Fprobe-based tracing events
+ * Copyright (C) 2022 Google LLC.
+ */
+#define pr_fmt(fmt) "trace_fprobe: " fmt
+
+#include <linux/fprobe.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/security.h>
+#include <linux/tracepoint.h>
+#include <linux/uaccess.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
+
+#define FPROBE_EVENT_SYSTEM "fprobes"
+#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
+#define RETHOOK_MAXACTIVE_MAX 4096
+
+static int trace_fprobe_create(const char *raw_command);
+static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_fprobe_release(struct dyn_event *ev);
+static bool trace_fprobe_is_busy(struct dyn_event *ev);
+static bool trace_fprobe_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev);
+
+static struct dyn_event_operations trace_fprobe_ops = {
+ .create = trace_fprobe_create,
+ .show = trace_fprobe_show,
+ .is_busy = trace_fprobe_is_busy,
+ .free = trace_fprobe_release,
+ .match = trace_fprobe_match,
+};
+
+/*
+ * Fprobe event core functions
+ */
+struct trace_fprobe {
+ struct dyn_event devent;
+ struct fprobe fp;
+ const char *symbol;
+ struct tracepoint *tpoint;
+ struct module *mod;
+ struct trace_probe tp;
+};
+
+static bool is_trace_fprobe(struct dyn_event *ev)
+{
+ return ev->ops == &trace_fprobe_ops;
+}
+
+static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_fprobe, devent);
+}
+
+/**
+ * for_each_trace_fprobe - iterate over the trace_fprobe list
+ * @pos: the struct trace_fprobe * for each entry
+ * @dpos: the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_fprobe(pos, dpos) \
+ for_each_dyn_event(dpos) \
+ if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos)))
+
+static bool trace_fprobe_is_return(struct trace_fprobe *tf)
+{
+ return tf->fp.exit_handler != NULL;
+}
+
+static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
+{
+ return tf->tpoint != NULL;
+}
+
+static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
+{
+ return tf->symbol ? tf->symbol : "unknown";
+}
+
+static bool trace_fprobe_is_busy(struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+
+ return trace_probe_is_enabled(&tf->tp);
+}
+
+static bool trace_fprobe_match_command_head(struct trace_fprobe *tf,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+
+ if (!argc)
+ return true;
+
+ snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf));
+ if (strcmp(buf, argv[0]))
+ return false;
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tf->tp, argc, argv);
+}
+
+static bool trace_fprobe_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+
+ if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event))
+ return false;
+
+ if (system && strcmp(trace_probe_group_name(&tf->tp), system))
+ return false;
+
+ return trace_fprobe_match_command_head(tf, argc, argv);
+}
+
+static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
+{
+ return fprobe_is_registered(&tf->fp);
+}
+
+/*
+ * Note that we don't verify the fetch_insn code, since it does not come
+ * from user space.
+ */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ struct pt_regs *regs = rec;
+ unsigned long val;
+ int ret;
+
+retry:
+ /* 1st stage: get value from context */
+ switch (code->op) {
+ case FETCH_OP_STACK:
+ val = regs_get_kernel_stack_nth(regs, code->param);
+ break;
+ case FETCH_OP_STACKP:
+ val = kernel_stack_pointer(regs);
+ break;
+ case FETCH_OP_RETVAL:
+ val = regs_return_value(regs);
+ break;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+#endif
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
+ }
+ code++;
+
+ return process_fetch_insn_bottom(code, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* function entry handler */
+static nokprobe_inline void
+__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs,
+ struct trace_event_file *trace_file)
+{
+ struct fentry_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != trace_file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tf->tp.size + dsize);
+ if (!entry)
+ return;
+
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ entry->ip = entry_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static void
+fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ trace_probe_for_each_link_rcu(link, &tf->tp)
+ __fentry_trace_func(tf, entry_ip, regs, link->file);
+}
+NOKPROBE_SYMBOL(fentry_trace_func);
+
+/* Kretprobe handler */
+static nokprobe_inline void
+__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ struct trace_event_file *trace_file)
+{
+ struct fexit_trace_entry_head *entry;
+ struct trace_event_buffer fbuffer;
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ int dsize;
+
+ if (WARN_ON_ONCE(call != trace_file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tf->tp.size + dsize);
+ if (!entry)
+ return;
+
+ fbuffer.regs = regs;
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ entry->func = entry_ip;
+ entry->ret_ip = ret_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+static void
+fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ trace_probe_for_each_link_rcu(link, &tf->tp)
+ __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
+}
+NOKPROBE_SYMBOL(fexit_trace_func);
+
+#ifdef CONFIG_PERF_EVENTS
+
+static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct fentry_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return 0;
+
+ dsize = __get_data_size(&tf->tp, regs);
+ __size = sizeof(*entry) + tf->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ if (!entry)
+ return 0;
+
+ entry->ip = entry_ip;
+ memset(&entry[1], 0, dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
+ return 0;
+}
+NOKPROBE_SYMBOL(fentry_perf_func);
+
+static void
+fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+ struct fexit_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tf->tp, regs);
+ __size = sizeof(*entry) + tf->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->func = entry_ip;
+ entry->ret_ip = ret_ip;
+ store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
+}
+NOKPROBE_SYMBOL(fexit_perf_func);
+#endif /* CONFIG_PERF_EVENTS */
+
+static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+ int ret = 0;
+
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
+ fentry_trace_func(tf, entry_ip, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
+ ret = fentry_perf_func(tf, entry_ip, regs);
+#endif
+ return ret;
+}
+NOKPROBE_SYMBOL(fentry_dispatcher);
+
+static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
+ fexit_trace_func(tf, entry_ip, ret_ip, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
+ fexit_perf_func(tf, entry_ip, ret_ip, regs);
+#endif
+}
+NOKPROBE_SYMBOL(fexit_dispatcher);
+
+static void free_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (tf) {
+ trace_probe_cleanup(&tf->tp);
+ kfree(tf->symbol);
+ kfree(tf);
+ }
+}
+
+/*
+ * Allocate new trace_probe and initialize it (including fprobe).
+ */
+static struct trace_fprobe *alloc_trace_fprobe(const char *group,
+ const char *event,
+ const char *symbol,
+ struct tracepoint *tpoint,
+ int maxactive,
+ int nargs, bool is_return)
+{
+ struct trace_fprobe *tf;
+ int ret = -ENOMEM;
+
+ tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
+ if (!tf)
+ return ERR_PTR(ret);
+
+ tf->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tf->symbol)
+ goto error;
+
+ if (is_return)
+ tf->fp.exit_handler = fexit_dispatcher;
+ else
+ tf->fp.entry_handler = fentry_dispatcher;
+
+ tf->tpoint = tpoint;
+ tf->fp.nr_maxactive = maxactive;
+
+ ret = trace_probe_init(&tf->tp, event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&tf->devent, &trace_fprobe_ops);
+ return tf;
+error:
+ free_trace_fprobe(tf);
+ return ERR_PTR(ret);
+}
+
+static struct trace_fprobe *find_trace_fprobe(const char *event,
+ const char *group)
+{
+ struct dyn_event *pos;
+ struct trace_fprobe *tf;
+
+ for_each_trace_fprobe(tf, pos)
+ if (strcmp(trace_probe_name(&tf->tp), event) == 0 &&
+ strcmp(trace_probe_group_name(&tf->tp), group) == 0)
+ return tf;
+ return NULL;
+}
+
+static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (trace_fprobe_is_registered(tf))
+ enable_fprobe(&tf->fp);
+
+ return 0;
+}
+
+static void __disable_trace_fprobe(struct trace_probe *tp)
+{
+ struct trace_fprobe *tf;
+
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ if (!trace_fprobe_is_registered(tf))
+ continue;
+ disable_fprobe(&tf->fp);
+ }
+}
+
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_fprobe *tf;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (!enabled) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ /* TODO: check the fprobe is gone */
+ __enable_trace_fprobe(tf);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp))
+ __disable_trace_fprobe(tp);
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+/* Event entry printers */
+static enum print_line_t
+print_fentry_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct fentry_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct fentry_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_fexit_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct fexit_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct fexit_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_puts(s, " <- ");
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+
+ out:
+ return trace_handle_return(s);
+}
+
+static int fentry_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct fentry_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static int fexit_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct fexit_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_functions fentry_funcs = {
+ .trace = print_fentry_event
+};
+
+static struct trace_event_functions fexit_funcs = {
+ .trace = print_fexit_event
+};
+
+static struct trace_event_fields fentry_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = fentry_event_define_fields },
+ {}
+};
+
+static struct trace_event_fields fexit_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = fexit_event_define_fields },
+ {}
+};
+
+static int fprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data);
+
+static inline void init_trace_event_call(struct trace_fprobe *tf)
+{
+ struct trace_event_call *call = trace_probe_event_call(&tf->tp);
+
+ if (trace_fprobe_is_return(tf)) {
+ call->event.funcs = &fexit_funcs;
+ call->class->fields_array = fexit_fields_array;
+ } else {
+ call->event.funcs = &fentry_funcs;
+ call->class->fields_array = fentry_fields_array;
+ }
+
+ call->flags = TRACE_EVENT_FL_FPROBE;
+ call->class->reg = fprobe_register;
+}
+
+static int register_fprobe_event(struct trace_fprobe *tf)
+{
+ init_trace_event_call(tf);
+
+ return trace_probe_register_event_call(&tf->tp);
+}
+
+static int unregister_fprobe_event(struct trace_fprobe *tf)
+{
+ return trace_probe_unregister_event_call(&tf->tp);
+}
+
+/* Internal register function - just handle fprobe and flags */
+static int __register_trace_fprobe(struct trace_fprobe *tf)
+{
+ int i, ret;
+
+ /* Should we need new LOCKDOWN flag for fprobe? */
+ ret = security_locked_down(LOCKDOWN_KPROBES);
+ if (ret)
+ return ret;
+
+ if (trace_fprobe_is_registered(tf))
+ return -EINVAL;
+
+ for (i = 0; i < tf->tp.nr_args; i++) {
+ ret = traceprobe_update_arg(&tf->tp.args[i]);
+ if (ret)
+ return ret;
+ }
+
+ /* Set/clear disabled flag according to tp->flag */
+ if (trace_probe_is_enabled(&tf->tp))
+ tf->fp.flags &= ~FPROBE_FL_DISABLED;
+ else
+ tf->fp.flags |= FPROBE_FL_DISABLED;
+
+ if (trace_fprobe_is_tracepoint(tf)) {
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ /*
+ * Here, we do 2 steps to enable fprobe on a tracepoint.
+ * At first, put __probestub_##TP function on the tracepoint
+ * and put a fprobe on the stub function.
+ */
+ ret = tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+ if (ret < 0)
+ return ret;
+ return register_fprobe_ips(&tf->fp, &ip, 1);
+ }
+
+ /* TODO: handle filter, nofilter or symbol list */
+ return register_fprobe(&tf->fp, tf->symbol, NULL);
+}
+
+/* Internal unregister function - just handle fprobe and flags */
+static void __unregister_trace_fprobe(struct trace_fprobe *tf)
+{
+ if (trace_fprobe_is_registered(tf)) {
+ unregister_fprobe(&tf->fp);
+ memset(&tf->fp, 0, sizeof(tf->fp));
+ if (trace_fprobe_is_tracepoint(tf)) {
+ tracepoint_probe_unregister(tf->tpoint,
+ tf->tpoint->probestub, NULL);
+ tf->tpoint = NULL;
+ tf->mod = NULL;
+ }
+ }
+}
+
+/* TODO: make this trace_*probe common function */
+/* Unregister a trace_probe and probe_event */
+static int unregister_trace_fprobe(struct trace_fprobe *tf)
+{
+ /* If other probes are on the event, just unregister fprobe */
+ if (trace_probe_has_sibling(&tf->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&tf->tp))
+ return -EBUSY;
+
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp)))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_fprobe_event(tf))
+ return -EBUSY;
+
+unreg:
+ __unregister_trace_fprobe(tf);
+ dyn_event_remove(&tf->devent);
+ trace_probe_unlink(&tf->tp);
+
+ return 0;
+}
+
+static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
+ struct trace_fprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ int i;
+
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
+ if (strcmp(trace_fprobe_symbol(orig),
+ trace_fprobe_symbol(comp)))
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+{
+ int ret;
+
+ if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) ||
+ trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ return -EEXIST;
+ }
+ ret = trace_probe_compare_arg_type(&tf->tp, &to->tp);
+ if (ret) {
+ /* Note that argument starts index = 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_fprobe_has_same_fprobe(to, tf)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tf->tp, &to->tp);
+ if (ret)
+ return ret;
+
+ ret = __register_trace_fprobe(tf);
+ if (ret)
+ trace_probe_unlink(&tf->tp);
+ else
+ dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
+
+ return ret;
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_fprobe(struct trace_fprobe *tf)
+{
+ struct trace_fprobe *old_tf;
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
+ trace_probe_group_name(&tf->tp));
+ if (old_tf) {
+ ret = append_trace_fprobe(tf, old_tf);
+ goto end;
+ }
+
+ /* Register new event */
+ ret = register_fprobe_event(tf);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ /* Register fprobe */
+ ret = __register_trace_fprobe(tf);
+ if (ret < 0)
+ unregister_fprobe_event(tf);
+ else
+ dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
+
+end:
+ mutex_unlock(&event_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+static int __tracepoint_probe_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct tp_module *tp_mod = data;
+ struct trace_fprobe *tf;
+ struct dyn_event *pos;
+
+ if (val != MODULE_STATE_GOING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&event_mutex);
+ for_each_trace_fprobe(tf, pos) {
+ if (tp_mod->mod == tf->mod) {
+ tracepoint_probe_unregister(tf->tpoint,
+ tf->tpoint->probestub, NULL);
+ tf->tpoint = NULL;
+ tf->mod = NULL;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+#endif /* CONFIG_MODULES */
+
+struct __find_tracepoint_cb_data {
+ const char *tp_name;
+ struct tracepoint *tpoint;
+};
+
+static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name))
+ data->tpoint = tp;
+}
+
+static struct tracepoint *find_tracepoint(const char *tp_name)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ };
+
+ for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
+
+ return data.tpoint;
+}
+
+static int parse_symbol_and_return(int argc, const char *argv[],
+ char **symbol, bool *is_return,
+ bool is_tracepoint)
+{
+ char *tmp = strchr(argv[1], '%');
+ int i;
+
+ if (tmp) {
+ int len = tmp - argv[1];
+
+ if (!is_tracepoint && !strcmp(tmp, "%return")) {
+ *is_return = true;
+ } else {
+ trace_probe_log_err(len, BAD_ADDR_SUFFIX);
+ return -EINVAL;
+ }
+ *symbol = kmemdup_nul(argv[1], len, GFP_KERNEL);
+ } else
+ *symbol = kstrdup(argv[1], GFP_KERNEL);
+ if (!*symbol)
+ return -ENOMEM;
+
+ if (*is_return)
+ return 0;
+
+ /* If there is $retval, this should be a return fprobe. */
+ for (i = 2; i < argc; i++) {
+ tmp = strstr(argv[i], "$retval");
+ if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') {
+ if (is_tracepoint) {
+ trace_probe_log_set_index(i);
+ trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+ return -EINVAL;
+ }
+ *is_return = true;
+ break;
+ }
+ }
+ return 0;
+}
+
+static int __trace_fprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * - Add fentry probe:
+ * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS]
+ * - Add fexit probe:
+ * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS]
+ * - Add tracepoint probe:
+ * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS]
+ *
+ * Fetch args:
+ * $retval : fetch return value
+ * $stack : fetch stack address
+ * $stackN : fetch Nth entry of stack (N:0-)
+ * $argN : fetch Nth argument (N:1-)
+ * $comm : fetch current task comm
+ * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
+ * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+ * Dereferencing memory fetch:
+ * +|-offs(ARG) : fetch memory at ARG +|- offs address.
+ * Alias name of args:
+ * NAME=FETCHARG : set NAME as alias of FETCHARG.
+ * Type of args:
+ * FETCHARG:TYPE : use TYPE instead of unsigned long.
+ */
+ struct trace_fprobe *tf = NULL;
+ int i, len, new_argc = 0, ret = 0;
+ bool is_return = false;
+ char *symbol = NULL;
+ const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
+ const char **new_argv = NULL;
+ int maxactive = 0;
+ char buf[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
+ char sbuf[KSYM_NAME_LEN];
+ char abuf[MAX_BTF_ARGS_LEN];
+ bool is_tracepoint = false;
+ struct tracepoint *tpoint = NULL;
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
+
+ if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
+ return -ECANCELED;
+
+ if (argv[0][0] == 't') {
+ is_tracepoint = true;
+ group = TRACEPOINT_EVENT_SYSTEM;
+ }
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event)
+ event++;
+
+ if (isdigit(argv[0][1])) {
+ if (event)
+ len = event - &argv[0][1] - 1;
+ else
+ len = strlen(&argv[0][1]);
+ if (len > MAX_EVENT_NAME_LEN - 1) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
+ }
+ memcpy(buf, &argv[0][1], len);
+ buf[len] = '\0';
+ ret = kstrtouint(buf, 0, &maxactive);
+ if (ret || !maxactive) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
+ }
+ /* fprobe rethook instances are iterated over via a list. The
+ * maximum should stay reasonable.
+ */
+ if (maxactive > RETHOOK_MAXACTIVE_MAX) {
+ trace_probe_log_err(1, MAXACT_TOO_BIG);
+ goto parse_error;
+ }
+ }
+
+ trace_probe_log_set_index(1);
+
+ /* a symbol(or tracepoint) must be specified */
+ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
+ if (ret < 0)
+ goto parse_error;
+
+ if (!is_return && maxactive) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(1, BAD_MAXACT_TYPE);
+ goto parse_error;
+ }
+
+ trace_probe_log_set_index(0);
+ if (event) {
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ }
+
+ if (!event) {
+ /* Make a new event name */
+ if (is_tracepoint)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ isdigit(*symbol) ? "_" : "", symbol);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
+ is_return ? "exit" : "entry");
+ sanitize_event_name(buf);
+ event = buf;
+ }
+
+ if (is_return)
+ ctx.flags |= TPARG_FL_RETURN;
+ else
+ ctx.flags |= TPARG_FL_FENTRY;
+
+ if (is_tracepoint) {
+ ctx.flags |= TPARG_FL_TPOINT;
+ tpoint = find_tracepoint(symbol);
+ if (!tpoint) {
+ trace_probe_log_set_index(1);
+ trace_probe_log_err(0, NO_TRACEPOINT);
+ goto parse_error;
+ }
+ ctx.funcname = kallsyms_lookup(
+ (unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
+ } else
+ ctx.funcname = symbol;
+
+ argc -= 2; argv += 2;
+ new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
+ abuf, MAX_BTF_ARGS_LEN, &ctx);
+ if (IS_ERR(new_argv)) {
+ ret = PTR_ERR(new_argv);
+ new_argv = NULL;
+ goto out;
+ }
+ if (new_argv) {
+ argc = new_argc;
+ argv = new_argv;
+ }
+
+ /* setup a probe */
+ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
+ argc, is_return);
+ if (IS_ERR(tf)) {
+ ret = PTR_ERR(tf);
+ /* This must return -ENOMEM, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
+ goto out; /* We know tf is not allocated */
+ }
+
+ if (is_tracepoint)
+ tf->mod = __module_text_address(
+ (unsigned long)tf->tpoint->probestub);
+
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ctx.offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ if (ret)
+ goto error; /* This can be -ENOMEM */
+ }
+
+ ret = traceprobe_set_print_fmt(&tf->tp,
+ is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
+ if (ret < 0)
+ goto error;
+
+ ret = register_trace_fprobe(tf);
+ if (ret) {
+ trace_probe_log_set_index(1);
+ if (ret == -EILSEQ)
+ trace_probe_log_err(0, BAD_INSN_BNDRY);
+ else if (ret == -ENOENT)
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ else if (ret != -ENOMEM && ret != -EEXIST)
+ trace_probe_log_err(0, FAIL_REG_PROBE);
+ goto error;
+ }
+
+out:
+ traceprobe_finish_parse(&ctx);
+ trace_probe_log_clear();
+ kfree(new_argv);
+ kfree(symbol);
+ return ret;
+
+parse_error:
+ ret = -EINVAL;
+error:
+ free_trace_fprobe(tf);
+ goto out;
+}
+
+static int trace_fprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_fprobe_create);
+}
+
+static int trace_fprobe_release(struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+ int ret = unregister_trace_fprobe(tf);
+
+ if (!ret)
+ free_trace_fprobe(tf);
+ return ret;
+}
+
+static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_fprobe *tf = to_trace_fprobe(ev);
+ int i;
+
+ if (trace_fprobe_is_tracepoint(tf))
+ seq_putc(m, 't');
+ else
+ seq_putc(m, 'f');
+ if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
+ seq_printf(m, "%d", tf->fp.nr_maxactive);
+ seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
+ trace_probe_name(&tf->tp));
+
+ seq_printf(m, " %s%s", trace_fprobe_symbol(tf),
+ trace_fprobe_is_return(tf) ? "%return" : "");
+
+ for (i = 0; i < tf->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+/*
+ * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
+ */
+static int fprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_fprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_fprobe(event, file);
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return enable_trace_fprobe(event, NULL);
+ case TRACE_REG_PERF_UNREGISTER:
+ return disable_trace_fprobe(event, NULL);
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup fprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int init_fprobe_trace_early(void)
+{
+ int ret;
+
+ ret = dyn_event_register(&trace_fprobe_ops);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
+ ret = register_tracepoint_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ return ret;
+#endif
+
+ return 0;
+}
+core_initcall(init_fprobe_trace_early);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index dd4dff71d89a..9f1bfbe105e8 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -23,40 +23,63 @@ static void tracing_start_function_trace(struct trace_array *tr);
static void tracing_stop_function_trace(struct trace_array *tr);
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs);
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs);
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs);
static struct tracer_flags func_flags;
/* Our option */
enum {
- TRACE_FUNC_OPT_STACK = 0x1,
+
+ TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */
+ TRACE_FUNC_OPT_STACK = 0x1,
+ TRACE_FUNC_OPT_NO_REPEATS = 0x2,
+
+ /* Update this to next highest bit. */
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x4
};
-static int allocate_ftrace_ops(struct trace_array *tr)
+#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
+
+int ftrace_allocate_ftrace_ops(struct trace_array *tr)
{
struct ftrace_ops *ops;
+ /* The top level array uses the "global_ops" */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
ops = kzalloc(sizeof(*ops), GFP_KERNEL);
if (!ops)
return -ENOMEM;
/* Currently only the non stack version is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
+ ops->flags = FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
+
return 0;
}
+void ftrace_free_ftrace_ops(struct trace_array *tr)
+{
+ kfree(tr->ops);
+ tr->ops = NULL;
+}
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent)
{
- int ret;
-
/*
* The top level array uses the "global_ops", and the files are
* created on boot up.
@@ -64,9 +87,8 @@ int ftrace_create_function_files(struct trace_array *tr,
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return 0;
- ret = allocate_ftrace_ops(tr);
- if (ret)
- return ret;
+ if (!tr->ops)
+ return -EINVAL;
ftrace_create_filter_files(tr->ops, parent);
@@ -76,14 +98,40 @@ int ftrace_create_function_files(struct trace_array *tr,
void ftrace_destroy_function_files(struct trace_array *tr)
{
ftrace_destroy_filter_files(tr->ops);
- kfree(tr->ops);
- tr->ops = NULL;
+ ftrace_free_ftrace_ops(tr);
+}
+
+static ftrace_func_t select_trace_function(u32 flags_val)
+{
+ switch (flags_val & TRACE_FUNC_OPT_MASK) {
+ case TRACE_FUNC_NO_OPTS:
+ return function_trace_call;
+ case TRACE_FUNC_OPT_STACK:
+ return function_stack_trace_call;
+ case TRACE_FUNC_OPT_NO_REPEATS:
+ return function_no_repeats_trace_call;
+ case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS:
+ return function_stack_no_repeats_trace_call;
+ default:
+ return NULL;
+ }
+}
+
+static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
+{
+ if (!tr->last_func_repeats &&
+ (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) {
+ tr->last_func_repeats = alloc_percpu(struct trace_func_repeats);
+ if (!tr->last_func_repeats)
+ return false;
+ }
+
+ return true;
}
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
-
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
@@ -92,17 +140,16 @@ static int function_trace_init(struct trace_array *tr)
if (!tr->ops)
return -ENOMEM;
- /* Currently only the global instance can do stack tracing */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
- func_flags.val & TRACE_FUNC_OPT_STACK)
- func = function_stack_trace_call;
- else
- func = function_trace_call;
+ func = select_trace_function(func_flags.val);
+ if (!func)
+ return -EINVAL;
+
+ if (!handle_func_repeats(tr, func_flags.val))
+ return -ENOMEM;
ftrace_init_array_ops(tr, func);
- tr->array_buffer.cpu = get_cpu();
- put_cpu();
+ tr->array_buffer.cpu = raw_smp_processor_id();
tracing_start_cmdline_record();
tracing_start_function_trace(tr);
@@ -123,35 +170,29 @@ static void function_trace_start(struct trace_array *tr)
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
int bit;
int cpu;
- int pc;
if (unlikely(!tr->function_enabled))
return;
- pc = preempt_count();
- preempt_disable_notrace();
-
- bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
- goto out;
+ return;
+
+ trace_ctx = tracing_gen_ctx();
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (!atomic_read(&data->disabled)) {
- local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
- }
- trace_clear_recursion(bit);
+ if (!atomic_read(&data->disabled))
+ trace_function(tr, ip, parent_ip, trace_ctx);
- out:
- preempt_enable_notrace();
+ ftrace_test_recursion_unlock(bit);
}
#ifdef CONFIG_UNWINDER_ORC
@@ -174,14 +215,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
if (unlikely(!tr->function_enabled))
return;
@@ -196,24 +237,143 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- trace_function(tr, ip, parent_ip, flags, pc);
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_function(tr, ip, parent_ip, trace_ctx);
+ __trace_stack(tr, trace_ctx, STACK_SKIP);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
+static inline bool is_repeat_check(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned long ip, unsigned long parent_ip)
+{
+ if (last_info->ip == ip &&
+ last_info->parent_ip == parent_ip &&
+ last_info->count < U16_MAX) {
+ last_info->ts_last_call =
+ ring_buffer_time_stamp(tr->array_buffer.buffer);
+ last_info->count++;
+ return true;
+ }
+
+ return false;
+}
+
+static inline void process_repeats(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx)
+{
+ if (last_info->count) {
+ trace_last_func_repeats(tr, last_info, trace_ctx);
+ last_info->count = 0;
+ }
+
+ last_info->ip = ip;
+ last_info->parent_ip = parent_ip;
+}
+
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ struct trace_func_repeats *last_info;
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned int trace_ctx;
+ unsigned long flags;
+ int bit;
+ int cpu;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ if (atomic_read(&data->disabled))
+ goto out;
+
+ /*
+ * An interrupt may happen at any place here. But as far as I can see,
+ * the only damage that this can cause is to mess up the repetition
+ * counter without valuable data being lost.
+ * TODO: think about a solution that is better than just hoping to be
+ * lucky.
+ */
+ last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ if (is_repeat_check(tr, last_info, ip, parent_ip))
+ goto out;
+
+ local_save_flags(flags);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
+
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ struct trace_func_repeats *last_info;
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ unsigned int trace_ctx;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ if (is_repeat_check(tr, last_info, ip, parent_ip))
+ goto out;
+
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
+ __trace_stack(tr, trace_ctx, STACK_SKIP);
+ }
+
+ out:
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
+ { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
{ } /* Always set a last empty entry */
};
static struct tracer_flags func_flags = {
- .val = 0, /* By default: all flags disabled */
+ .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */
.opts = func_opts
};
@@ -235,30 +395,32 @@ static struct tracer function_trace;
static int
func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
- switch (bit) {
- case TRACE_FUNC_OPT_STACK:
- /* do nothing if already set */
- if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
- break;
-
- /* We can change this flag when not running. */
- if (tr->current_trace != &function_trace)
- break;
+ ftrace_func_t func;
+ u32 new_flags;
- unregister_ftrace_function(tr->ops);
+ /* Do nothing if already set. */
+ if (!!set == !!(func_flags.val & bit))
+ return 0;
- if (set) {
- tr->ops->func = function_stack_trace_call;
- register_ftrace_function(tr->ops);
- } else {
- tr->ops->func = function_trace_call;
- register_ftrace_function(tr->ops);
- }
+ /* We can change this flag only when not running. */
+ if (tr->current_trace != &function_trace)
+ return 0;
- break;
- default:
+ new_flags = (func_flags.val & ~bit) | (set ? bit : 0);
+ func = select_trace_function(new_flags);
+ if (!func)
return -EINVAL;
- }
+
+ /* Check if there's anything to change. */
+ if (tr->ops->func == func)
+ return 0;
+
+ if (!handle_func_repeats(tr, new_flags))
+ return -ENOMEM;
+
+ unregister_ftrace_function(tr->ops);
+ tr->ops->func = func;
+ register_ftrace_function(tr->ops);
return 0;
}
@@ -401,13 +563,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- local_save_flags(flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
+ __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4a9c49c08ec9..c35fbaab2a47 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ /* Display function return value ? */
+ { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) },
+ /* Display function return value in hexadecimal format ? */
+ { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -96,8 +102,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
@@ -105,7 +110,7 @@ int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
@@ -121,7 +126,7 @@ static inline int ftrace_graph_ignore_irqs(void)
if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
return 0;
- return in_irq();
+ return in_hardirq();
}
int trace_graph_entry(struct ftrace_graph_ent *trace)
@@ -129,10 +134,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int ret;
int cpu;
- int pc;
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
return 0;
@@ -174,8 +179,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
} else {
ret = 0;
}
@@ -188,7 +193,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
static void
__trace_graph_function(struct trace_array *tr,
- unsigned long ip, unsigned long flags, int pc)
+ unsigned long ip, unsigned int trace_ctx)
{
u64 time = trace_clock_local();
struct ftrace_graph_ent ent = {
@@ -202,22 +207,21 @@ __trace_graph_function(struct trace_array *tr,
.rettime = time,
};
- __trace_graph_entry(tr, &ent, flags, pc);
- __trace_graph_return(tr, &ret, flags, pc);
+ __trace_graph_entry(tr, &ent, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx);
}
void
trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- __trace_graph_function(tr, ip, flags, pc);
+ __trace_graph_function(tr, ip, trace_ctx);
}
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
@@ -225,7 +229,7 @@ void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -239,9 +243,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
ftrace_graph_addr_finish(trace);
@@ -255,8 +259,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
@@ -621,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
trace_seq_puts(s, "| ");
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+
+#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+
+static void print_graph_retval(struct trace_seq *s, unsigned long retval,
+ bool leaf, void *func, bool hex_format)
+{
+ unsigned long err_code = 0;
+
+ if (retval == 0 || hex_format)
+ goto done;
+
+ /* Check if the return value matches the negative format */
+ if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+ (((u64)retval) >> 32) == 0) {
+ /* sign extension */
+ err_code = (unsigned long)(s32)retval;
+ } else {
+ err_code = retval;
+ }
+
+ if (!IS_ERR_VALUE(err_code))
+ err_code = 0;
+
+done:
+ if (leaf) {
+ if (hex_format || (err_code == 0))
+ trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
+ func, retval);
+ else
+ trace_seq_printf(s, "%ps(); /* = %ld */\n",
+ func, err_code);
+ } else {
+ if (hex_format || (err_code == 0))
+ trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
+ func, retval);
+ else
+ trace_seq_printf(s, "} /* %ps = %ld */\n",
+ func, err_code);
+ }
+}
+
+#else
+
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+
+#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+
+#endif
+
/* Case of a leaf function on its call entry */
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
@@ -665,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ /*
+ * Write out the function return value if the option function-retval is
+ * enabled.
+ */
+ if (flags & __TRACE_GRAPH_PRINT_RETVAL)
+ print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
+ !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ else
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -766,7 +828,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
* - we are inside irq code
* - we just entered irq code
*
- * retunns 0 if
+ * returns 0 if
* - funcgraph-interrupts option is set
* - we are not inside irq code
*/
@@ -944,20 +1006,29 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
trace_seq_putc(s, ' ');
/*
- * If the return function does not have a matching entry,
- * then the entry was lost. Instead of just printing
- * the '}' and letting the user guess what function this
- * belongs to, write out the function name. Always do
- * that if the funcgraph-tail option is enabled.
+ * Always write out the function name and its return value if the
+ * function-retval option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
- trace_seq_puts(s, "}\n");
- else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
+ print_graph_retval(s, trace->retval, false, (void *)trace->func,
+ !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ } else {
+ /*
+ * If the return function does not have a matching entry,
+ * then the entry was lost. Instead of just printing
+ * the '}' and letting the user guess what function this
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
+ */
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ }
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
- trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace_seq_printf(s, " (Overruns: %u)\n",
trace->overrun);
print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
@@ -1336,13 +1407,13 @@ static const struct file_operations graph_depth_fops = {
static __init int init_graph_tracefs(void)
{
- struct dentry *d_tracer;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- trace_create_file("max_graph_depth", 0644, d_tracer,
+ trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL,
NULL, &graph_depth_fops);
return 0;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index e2be7bb7ef7e..b791524a6536 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -34,7 +34,7 @@
* Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
* Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
*
- * Includes useful feedback from Clark Williams <clark@redhat.com>
+ * Includes useful feedback from Clark Williams <williams@redhat.com>
*
*/
#include <linux/kthread.h>
@@ -54,20 +54,33 @@ static struct trace_array *hwlat_trace;
#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
-/* sampling thread*/
-static struct task_struct *hwlat_kthread;
-
static struct dentry *hwlat_sample_width; /* sample width us */
static struct dentry *hwlat_sample_window; /* sample window us */
+static struct dentry *hwlat_thread_mode; /* hwlat thread mode */
+
+enum {
+ MODE_NONE = 0,
+ MODE_ROUND_ROBIN,
+ MODE_PER_CPU,
+ MODE_MAX
+};
+static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" };
/* Save the previous tracing_thresh value */
static unsigned long save_tracing_thresh;
-/* NMI timestamp counters */
-static u64 nmi_ts_start;
-static u64 nmi_total_ts;
-static int nmi_count;
-static int nmi_cpu;
+/* runtime kthread data */
+struct hwlat_kthread_data {
+ struct task_struct *kthread;
+ /* NMI timestamp counters */
+ u64 nmi_ts_start;
+ u64 nmi_total_ts;
+ int nmi_count;
+ int nmi_cpu;
+};
+
+static struct hwlat_kthread_data hwlat_single_cpu_data;
+static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);
/* Tells NMIs to call back to the hwlat tracer to record timestamps */
bool trace_hwlat_callback_enabled;
@@ -83,7 +96,7 @@ struct hwlat_sample {
u64 nmi_total_ts; /* Total time spent in NMIs */
struct timespec64 timestamp; /* wall time */
int nmi_count; /* # NMIs during this sample */
- int count; /* # of iteratons over threash */
+ int count; /* # of iterations over thresh */
};
/* keep the global state somewhere. */
@@ -96,11 +109,24 @@ static struct hwlat_data {
u64 sample_window; /* total sampling window (on+off) */
u64 sample_width; /* active sampling portion of window */
+ int thread_mode; /* thread mode */
+
} hwlat_data = {
.sample_window = DEFAULT_SAMPLE_WINDOW,
.sample_width = DEFAULT_SAMPLE_WIDTH,
+ .thread_mode = MODE_ROUND_ROBIN
};
+static struct hwlat_kthread_data *get_cpu_data(void)
+{
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ return this_cpu_ptr(&hwlat_per_cpu_data);
+ else
+ return &hwlat_single_cpu_data;
+}
+
+static bool hwlat_busy;
+
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
@@ -108,14 +134,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
- unsigned long flags;
- int pc;
-
- pc = preempt_count();
- local_save_flags(flags);
event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
- flags, pc);
+ tracing_gen_ctx());
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -141,7 +162,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
void trace_hwlat_callback(bool enter)
{
- if (smp_processor_id() != nmi_cpu)
+ struct hwlat_kthread_data *kdata = get_cpu_data();
+
+ if (!kdata->kthread)
return;
/*
@@ -150,15 +173,24 @@ void trace_hwlat_callback(bool enter)
*/
if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
if (enter)
- nmi_ts_start = time_get();
+ kdata->nmi_ts_start = time_get();
else
- nmi_total_ts += time_get() - nmi_ts_start;
+ kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start;
}
if (enter)
- nmi_count++;
+ kdata->nmi_count++;
}
+/*
+ * hwlat_err - report a hwlat error.
+ */
+#define hwlat_err(msg) ({ \
+ struct trace_array *tr = hwlat_trace; \
+ \
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \
+})
+
/**
* get_sample - sample the CPU TSC and look for likely hardware latencies
*
@@ -168,6 +200,7 @@ void trace_hwlat_callback(bool enter)
*/
static int get_sample(void)
{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
struct trace_array *tr = hwlat_trace;
struct hwlat_sample s;
time_type start, t1, t2, last_t2;
@@ -180,9 +213,8 @@ static int get_sample(void)
do_div(thresh, NSEC_PER_USEC); /* modifies interval value */
- nmi_cpu = smp_processor_id();
- nmi_total_ts = 0;
- nmi_count = 0;
+ kdata->nmi_total_ts = 0;
+ kdata->nmi_count = 0;
/* Make sure NMIs see this first */
barrier();
@@ -202,7 +234,7 @@ static int get_sample(void)
outer_diff = time_to_us(time_sub(t1, last_t2));
/* This shouldn't happen */
if (outer_diff < 0) {
- pr_err(BANNER "time running backwards\n");
+ hwlat_err(BANNER "time running backwards\n");
goto out;
}
if (outer_diff > outer_sample)
@@ -214,7 +246,7 @@ static int get_sample(void)
/* Check for possible overflows */
if (total < last_total) {
- pr_err("Time total overflowed\n");
+ hwlat_err("Time total overflowed\n");
break;
}
last_total = total;
@@ -230,7 +262,7 @@ static int get_sample(void)
/* This shouldn't happen */
if (diff < 0) {
- pr_err(BANNER "time running backwards\n");
+ hwlat_err(BANNER "time running backwards\n");
goto out;
}
@@ -252,15 +284,15 @@ static int get_sample(void)
ret = 1;
/* We read in microseconds */
- if (nmi_total_ts)
- do_div(nmi_total_ts, NSEC_PER_USEC);
+ if (kdata->nmi_total_ts)
+ do_div(kdata->nmi_total_ts, NSEC_PER_USEC);
hwlat_data.count++;
s.seqnum = hwlat_data.count;
s.duration = sample;
s.outer_duration = outer_sample;
- s.nmi_total_ts = nmi_total_ts;
- s.nmi_count = nmi_count;
+ s.nmi_total_ts = kdata->nmi_total_ts;
+ s.nmi_count = kdata->nmi_count;
s.count = count;
trace_hwlat_sample(&s);
@@ -278,42 +310,41 @@ out:
}
static struct cpumask save_cpumask;
-static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
- if (disable_migrate)
- return;
/*
* If for some reason the user modifies the CPU affinity
* of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, current->cpus_ptr))
- goto disable;
+ goto change_mode;
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
- next_cpu = cpumask_next(smp_processor_id(), current_mask);
- put_online_cpus();
+ cpus_read_lock();
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+ next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ cpus_read_unlock();
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
- goto disable;
+ goto change_mode;
cpumask_clear(current_mask);
cpumask_set_cpu(next_cpu, current_mask);
- sched_setaffinity(0, current_mask);
+ set_cpus_allowed_ptr(current, current_mask);
return;
- disable:
- disable_migrate = true;
+ change_mode:
+ hwlat_data.thread_mode = MODE_NONE;
+ pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n");
}
/*
@@ -332,7 +363,8 @@ static int kthread_fn(void *data)
while (!kthread_should_stop()) {
- move_to_next_cpu();
+ if (hwlat_data.thread_mode == MODE_ROUND_ROBIN)
+ move_to_next_cpu();
local_irq_disable();
get_sample();
@@ -355,179 +387,377 @@ static int kthread_fn(void *data)
return 0;
}
-/**
- * start_kthread - Kick off the hardware latency sampling/detector kthread
+/*
+ * stop_stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_single_kthread(void)
+{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
+ struct task_struct *kthread;
+
+ cpus_read_lock();
+ kthread = kdata->kthread;
+
+ if (!kthread)
+ goto out_put_cpus;
+
+ kthread_stop(kthread);
+ kdata->kthread = NULL;
+
+out_put_cpus:
+ cpus_read_unlock();
+}
+
+
+/*
+ * start_single_kthread - Kick off the hardware latency sampling/detector kthread
*
* This starts the kernel thread that will sit and sample the CPU timestamp
* counter (TSC or similar) and look for potential hardware latencies.
*/
-static int start_kthread(struct trace_array *tr)
+static int start_single_kthread(struct trace_array *tr)
{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
struct cpumask *current_mask = &save_cpumask;
struct task_struct *kthread;
int next_cpu;
- if (WARN_ON(hwlat_kthread))
- return 0;
-
- /* Just pick the first CPU on first iteration */
- current_mask = &save_cpumask;
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
- put_online_cpus();
- next_cpu = cpumask_first(current_mask);
+ cpus_read_lock();
+ if (kdata->kthread)
+ goto out_put_cpus;
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
+ cpus_read_unlock();
return -ENOMEM;
}
- cpumask_clear(current_mask);
- cpumask_set_cpu(next_cpu, current_mask);
- sched_setaffinity(kthread->pid, current_mask);
+ /* Just pick the first CPU on first iteration */
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+ if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) {
+ next_cpu = cpumask_first(current_mask);
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+
+ }
+
+ set_cpus_allowed_ptr(kthread, current_mask);
- hwlat_kthread = kthread;
+ kdata->kthread = kthread;
wake_up_process(kthread);
+out_put_cpus:
+ cpus_read_unlock();
return 0;
}
-/**
- * stop_kthread - Inform the hardware latency samping/detector kthread to stop
+/*
+ * stop_cpu_kthread - Stop a hwlat cpu kthread
+ */
+static void stop_cpu_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+
+ kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread;
+ if (kthread)
+ kthread_stop(kthread);
+ per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
+}
+
+/*
+ * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthread to stop
*
- * This kicks the running hardware latency sampling/detector kernel thread and
+ * This kicks the running hardware latency sampling/detector kernel threads and
* tells it to stop sampling now. Use this on unload and at system shutdown.
*/
-static void stop_kthread(void)
+static void stop_per_cpu_kthreads(void)
{
- if (!hwlat_kthread)
- return;
- kthread_stop(hwlat_kthread);
- hwlat_kthread = NULL;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ stop_cpu_kthread(cpu);
+ cpus_read_unlock();
}
/*
- * hwlat_read - Wrapper read function for reading both window and width
- * @filp: The active open file structure
- * @ubuf: The userspace provided buffer to read value into
- * @cnt: The maximum number of bytes to read
- * @ppos: The current "file" position
- *
- * This function provides a generic read implementation for the global state
- * "hwlat_data" structure filesystem entries.
+ * start_cpu_kthread - Start a hwlat cpu kthread
*/
-static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int start_cpu_kthread(unsigned int cpu)
{
- char buf[U64STR_SIZE];
- u64 *entry = filp->private_data;
- u64 val;
- int len;
+ struct task_struct *kthread;
- if (!entry)
- return -EFAULT;
+ /* Do not start a new hwlatd thread if it is already running */
+ if (per_cpu(hwlat_per_cpu_data, cpu).kthread)
+ return 0;
- if (cnt > sizeof(buf))
- cnt = sizeof(buf);
+ kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ return -ENOMEM;
+ }
- val = *entry;
+ per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;
- len = snprintf(buf, sizeof(buf), "%llu\n", val);
+ return 0;
+}
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+#ifdef CONFIG_HOTPLUG_CPU
+static void hwlat_hotplug_workfn(struct work_struct *dummy)
+{
+ struct trace_array *tr = hwlat_trace;
+ unsigned int cpu = smp_processor_id();
+
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&hwlat_data.lock);
+ cpus_read_lock();
+
+ if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
+ goto out_unlock;
+
+ if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
+ goto out_unlock;
+
+ start_cpu_kthread(cpu);
+
+out_unlock:
+ cpus_read_unlock();
+ mutex_unlock(&hwlat_data.lock);
+ mutex_unlock(&trace_types_lock);
}
-/**
- * hwlat_width_write - Write function for "width" entry
- * @filp: The active open file structure
- * @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
- * @ppos: The current position in @file
+static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn);
+
+/*
+ * hwlat_cpu_init - CPU hotplug online callback function
+ */
+static int hwlat_cpu_init(unsigned int cpu)
+{
+ schedule_work_on(cpu, &hwlat_hotplug_work);
+ return 0;
+}
+
+/*
+ * hwlat_cpu_die - CPU hotplug offline callback function
+ */
+static int hwlat_cpu_die(unsigned int cpu)
+{
+ stop_cpu_kthread(cpu);
+ return 0;
+}
+
+static void hwlat_init_hotplug_support(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online",
+ hwlat_cpu_init, hwlat_cpu_die);
+ if (ret < 0)
+ pr_warn(BANNER "Error to init cpu hotplug support\n");
+
+ return;
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void hwlat_init_hotplug_support(void)
+{
+ return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads
*
- * This function provides a write implementation for the "width" interface
- * to the hardware latency detector. It can be used to configure
- * for how many us of the total window us we will actively sample for any
- * hardware-induced latency periods. Obviously, it is not possible to
- * sample constantly and have the system respond to a sample reader, or,
- * worse, without having the system appear to have gone out to lunch. It
- * is enforced that width is less that the total window size.
+ * This starts the kernel threads that will sit on potentially all cpus and
+ * sample the CPU timestamp counter (TSC or similar) and look for potential
+ * hardware latencies.
*/
-static ssize_t
-hwlat_width_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int start_per_cpu_kthreads(struct trace_array *tr)
{
- u64 val;
- int err;
+ struct cpumask *current_mask = &save_cpumask;
+ unsigned int cpu;
+ int retval;
- err = kstrtoull_from_user(ubuf, cnt, 10, &val);
- if (err)
- return err;
+ cpus_read_lock();
+ /*
+ * Run only on CPUs in which hwlat is allowed to run.
+ */
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+ for_each_cpu(cpu, current_mask) {
+ retval = start_cpu_kthread(cpu);
+ if (retval)
+ goto out_error;
+ }
+ cpus_read_unlock();
+
+ return 0;
+
+out_error:
+ cpus_read_unlock();
+ stop_per_cpu_kthreads();
+ return retval;
+}
+
+static void *s_mode_start(struct seq_file *s, loff_t *pos)
+{
+ int mode = *pos;
mutex_lock(&hwlat_data.lock);
- if (val < hwlat_data.sample_window)
- hwlat_data.sample_width = val;
+
+ if (mode >= MODE_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ int mode = ++(*pos);
+
+ if (mode >= MODE_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static int s_mode_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ int mode = *pos;
+
+ if (mode == hwlat_data.thread_mode)
+ seq_printf(s, "[%s]", thread_mode_str[mode]);
else
- err = -EINVAL;
- mutex_unlock(&hwlat_data.lock);
+ seq_printf(s, "%s", thread_mode_str[mode]);
- if (err)
- return err;
+ if (mode < MODE_MAX - 1) /* if mode is any but last */
+ seq_puts(s, " ");
- return cnt;
+ return 0;
}
+static void s_mode_stop(struct seq_file *s, void *v)
+{
+ seq_puts(s, "\n");
+ mutex_unlock(&hwlat_data.lock);
+}
+
+static const struct seq_operations thread_mode_seq_ops = {
+ .start = s_mode_start,
+ .next = s_mode_next,
+ .show = s_mode_show,
+ .stop = s_mode_stop
+};
+
+static int hwlat_mode_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &thread_mode_seq_ops);
+};
+
+static void hwlat_tracer_start(struct trace_array *tr);
+static void hwlat_tracer_stop(struct trace_array *tr);
+
/**
- * hwlat_window_write - Write function for "window" entry
+ * hwlat_mode_write - Write function for "mode" entry
* @filp: The active open file structure
* @ubuf: The user buffer that contains the value to write
* @cnt: The maximum number of bytes to write to "file"
* @ppos: The current position in @file
*
- * This function provides a write implementation for the "window" interface
- * to the hardware latency detetector. The window is the total time
- * in us that will be considered one sample period. Conceptually, windows
- * occur back-to-back and contain a sample width period during which
- * actual sampling occurs. Can be used to write a new total window size. It
- * is enfoced that any value written must be greater than the sample width
- * size, or an error results.
+ * This function provides a write implementation for the "mode" interface
+ * to the hardware latency detector. hwlatd has different operation modes.
+ * The "none" sets the allowed cpumask for a single hwlatd thread at the
+ * startup and lets the scheduler handle the migration. The default mode is
+ * the "round-robin" one, in which a single hwlatd thread runs, migrating
+ * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode
+ * creates one hwlatd thread per allowed CPU.
*/
-static ssize_t
-hwlat_window_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- u64 val;
- int err;
+ struct trace_array *tr = hwlat_trace;
+ const char *mode;
+ char buf[64];
+ int ret, i;
- err = kstrtoull_from_user(ubuf, cnt, 10, &val);
- if (err)
- return err;
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ mode = strstrip(buf);
+
+ ret = -EINVAL;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop
+ * and hwlat_busy.
+ */
+ mutex_lock(&trace_types_lock);
+ if (hwlat_busy)
+ hwlat_tracer_stop(tr);
mutex_lock(&hwlat_data.lock);
- if (hwlat_data.sample_width < val)
- hwlat_data.sample_window = val;
- else
- err = -EINVAL;
+
+ for (i = 0; i < MODE_MAX; i++) {
+ if (strcmp(mode, thread_mode_str[i]) == 0) {
+ hwlat_data.thread_mode = i;
+ ret = cnt;
+ }
+ }
+
mutex_unlock(&hwlat_data.lock);
- if (err)
- return err;
+ if (hwlat_busy)
+ hwlat_tracer_start(tr);
+ mutex_unlock(&trace_types_lock);
+
+ *ppos += cnt;
+
- return cnt;
+
+ return ret;
}
-static const struct file_operations width_fops = {
- .open = tracing_open_generic,
- .read = hwlat_read,
- .write = hwlat_width_write,
+/*
+ * The width parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is upper
+ * bounded by the window parameter.
+ */
+static struct trace_min_max_param hwlat_width = {
+ .lock = &hwlat_data.lock,
+ .val = &hwlat_data.sample_width,
+ .max = &hwlat_data.sample_window,
+ .min = NULL,
};
-static const struct file_operations window_fops = {
- .open = tracing_open_generic,
- .read = hwlat_read,
- .write = hwlat_window_write,
+/*
+ * The window parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is lower
+ * bounded by the width parameter.
+ */
+static struct trace_min_max_param hwlat_window = {
+ .lock = &hwlat_data.lock,
+ .val = &hwlat_data.sample_window,
+ .max = NULL,
+ .min = &hwlat_data.sample_width,
};
+static const struct file_operations thread_mode_fops = {
+ .open = hwlat_mode_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = hwlat_mode_write
+};
/**
* init_tracefs - A function to initialize the tracefs interface files
*
@@ -538,31 +768,38 @@ static const struct file_operations window_fops = {
*/
static int init_tracefs(void)
{
- struct dentry *d_tracer;
+ int ret;
struct dentry *top_dir;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return -ENOMEM;
- top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+ top_dir = tracefs_create_dir("hwlat_detector", NULL);
if (!top_dir)
return -ENOMEM;
- hwlat_sample_window = tracefs_create_file("window", 0640,
+ hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE,
top_dir,
- &hwlat_data.sample_window,
- &window_fops);
+ &hwlat_window,
+ &trace_min_max_fops);
if (!hwlat_sample_window)
goto err;
- hwlat_sample_width = tracefs_create_file("width", 0644,
+ hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE,
top_dir,
- &hwlat_data.sample_width,
- &width_fops);
+ &hwlat_width,
+ &trace_min_max_fops);
if (!hwlat_sample_width)
goto err;
+ hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE,
+ top_dir,
+ NULL,
+ &thread_mode_fops);
+ if (!hwlat_thread_mode)
+ goto err;
+
return 0;
err:
@@ -574,18 +811,22 @@ static void hwlat_tracer_start(struct trace_array *tr)
{
int err;
- err = start_kthread(tr);
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ err = start_per_cpu_kthreads(tr);
+ else
+ err = start_single_kthread(tr);
if (err)
pr_err(BANNER "Cannot start hwlat kthread\n");
}
static void hwlat_tracer_stop(struct trace_array *tr)
{
- stop_kthread();
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ stop_per_cpu_kthreads();
+ else
+ stop_single_kthread();
}
-static bool hwlat_busy;
-
static int hwlat_tracer_init(struct trace_array *tr)
{
/* Only allow one instance to enable this */
@@ -594,7 +835,6 @@ static int hwlat_tracer_init(struct trace_array *tr)
hwlat_trace = tr;
- disable_migrate = false;
hwlat_data.count = 0;
tr->max_latency = 0;
save_tracing_thresh = tracing_thresh;
@@ -613,7 +853,7 @@ static int hwlat_tracer_init(struct trace_array *tr)
static void hwlat_tracer_reset(struct trace_array *tr)
{
- stop_kthread();
+ hwlat_tracer_stop(tr);
/* the tracing threshold is static between runs */
last_tracing_thresh = tracing_thresh;
@@ -642,6 +882,8 @@ __init static int init_hwlat_tracer(void)
if (ret)
return ret;
+ hwlat_init_hotplug_support();
+
init_tracefs();
return 0;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 10bbb0f381d5..ba37f768e2f2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -138,16 +138,19 @@ static int func_prolog_dec(struct trace_array *tr,
*/
static void
irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
if (!func_prolog_dec(tr, &data, &flags))
return;
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
int ret;
- int pc;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_dec(tr, &data, &flags))
return 0;
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
return ret;
@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -228,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
@@ -267,12 +271,12 @@ static void irqsoff_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
#else
@@ -322,15 +326,13 @@ check_critical_timing(struct trace_array *tr,
{
u64 T0, T1, delta;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
- local_save_flags(flags);
-
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
if (!report_latency(tr, delta))
goto out;
@@ -341,9 +343,9 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(tr, delta))
goto out_unlock;
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
/* Skip 5 functions to get to the irq/preempt enable function */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, trace_ctx, 5);
if (data->critical_sequence != max_sequence)
goto out_unlock;
@@ -363,16 +365,15 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
}
static nokprobe_inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -393,9 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- local_save_flags(flags);
-
- __trace_function(tr, ip, parent_ip, flags, pc);
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
per_cpu(tracing_cpu, cpu) = 1;
@@ -403,12 +402,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
}
static nokprobe_inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -428,8 +427,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_inc(&data->disabled);
- local_save_flags(flags);
- __trace_function(tr, ip, parent_ip, flags, pc);
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -438,20 +437,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
/* start and stop critical timings used to for stoppage (in idle) */
void start_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
NOKPROBE_SYMBOL(stop_critical_timings);
@@ -562,6 +557,8 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+ /* without pause, we will produce garbage if another latency occurs */
+ set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1);
tr->max_latency = 0;
irqsoff_trace = tr;
@@ -583,11 +580,13 @@ static void __irqsoff_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+ int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE;
stop_irqsoff_tracer(tr, is_graph(tr));
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag);
ftrace_reset_array_ops(tr);
irqsoff_busy = false;
@@ -609,19 +608,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
*/
void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ stop_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ start_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_off);
@@ -661,18 +656,14 @@ static struct tracer irqsoff_tracer __read_mostly =
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ stop_critical_timing(a0, a1);
}
void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ start_critical_timing(a0, a1);
}
static int preemptoff_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 9da76104f7a2..59857a1ee44c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -147,11 +147,17 @@ static int kdb_ftdump(int argc, const char **argv)
return 0;
}
+static kdbtab_t ftdump_cmd = {
+ .name = "ftdump",
+ .func = kdb_ftdump,
+ .usage = "[skip_#entries] [cpu]",
+ .help = "Dump ftrace log; -skip dumps last #entries",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
static __init int kdb_ftrace_register(void)
{
- kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
- "Dump ftrace log; -skip dumps last #entries", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register(&ftdump_cmd);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aefb6065b508..c4c6e0e0068b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_kprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -19,22 +20,24 @@
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
/* Kprobe early definition from command line */
static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
-static bool kprobe_boot_events_enabled __initdata;
static int __init set_kprobe_boot_events(char *str)
{
- strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
- return 0;
+ strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ disable_tracing_selftest("running kprobe events");
+
+ return 1;
}
__setup("kprobe_event=", set_kprobe_boot_events);
-static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_create(const char *raw_command);
static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_kprobe_release(struct dyn_event *ev);
static bool trace_kprobe_is_busy(struct dyn_event *ev);
@@ -79,10 +82,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
-#define SIZEOF_TRACE_KPROBE(n) \
- (offsetof(struct trace_kprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
@@ -100,15 +99,16 @@ static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk
static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
{
- return !!(kprobe_gone(&tk->rp.kp));
+ return kprobe_gone(&tk->rp.kp);
}
static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
- int len = strlen(mod->name);
+ int len = strlen(module_name(mod));
const char *name = trace_kprobe_symbol(tk);
- return strncmp(mod->name, name, len) == 0 && name[len] == ':';
+
+ return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
@@ -122,9 +122,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
ret = !!find_module(tk->symbol);
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
*p = ':';
return ret;
@@ -164,7 +164,8 @@ static bool trace_kprobe_match(const char *system, const char *event,
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
- return strcmp(trace_probe_name(&tk->tp), event) == 0 &&
+ return (event[0] == '\0' ||
+ strcmp(trace_probe_name(&tk->tp), event) == 0) &&
(!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) &&
trace_kprobe_match_command_head(tk, argc, argv);
}
@@ -219,9 +220,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return tk ? kprobe_on_func_entry(tk->rp.kp.addr,
+ return tk ? (kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false;
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false;
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
@@ -263,7 +264,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
struct trace_kprobe *tk;
int ret = -ENOMEM;
- tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
+ tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
if (!tk)
return ERR_PTR(ret);
@@ -329,11 +330,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
static void __disable_trace_kprobe(struct trace_probe *tp)
{
- struct trace_probe *pos;
struct trace_kprobe *tk;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tk = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) {
if (!trace_kprobe_is_registered(tk))
continue;
if (trace_kprobe_is_return(tk))
@@ -350,7 +349,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp)
static int enable_trace_kprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_kprobe *tk;
bool enabled;
int ret = 0;
@@ -371,8 +370,7 @@ static int enable_trace_kprobe(struct trace_event_call *call,
if (enabled)
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tk = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) {
if (trace_kprobe_has_gone(tk))
continue;
ret = __enable_trace_kprobe(tk);
@@ -432,7 +430,7 @@ static int disable_trace_kprobe(struct trace_event_call *call,
return 0;
}
-#if defined(CONFIG_KPROBES_ON_FTRACE) && \
+#if defined(CONFIG_DYNAMIC_FTRACE) && \
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
static bool __within_notrace_func(unsigned long addr)
{
@@ -489,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
return -EINVAL;
if (within_notrace_func(tk)) {
- pr_warn("Could not probe notrace function %s\n",
- trace_kprobe_symbol(tk));
+ pr_warn("Could not probe notrace function %ps\n",
+ (void *)trace_kprobe_address(tk));
return -EINVAL;
}
@@ -541,6 +539,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
if (trace_probe_is_enabled(&tk->tp))
return -EBUSY;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp)))
+ return -EBUSY;
+
/* Will fail if probe is being used by ftrace or perf */
if (unregister_kprobe_event(tk))
return -EBUSY;
@@ -557,11 +559,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig,
struct trace_kprobe *comp)
{
struct trace_probe_event *tpe = orig->tp.event;
- struct trace_probe *pos;
int i;
- list_for_each_entry(pos, &tpe->probes, list) {
- orig = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
if (strcmp(trace_kprobe_symbol(orig),
trace_kprobe_symbol(comp)) ||
trace_kprobe_offset(orig) != trace_kprobe_offset(comp))
@@ -616,7 +616,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
if (ret)
trace_probe_unlink(&tk->tp);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
return ret;
}
@@ -645,7 +645,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register new event */
ret = register_kprobe_event(tk);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
@@ -659,7 +663,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (ret < 0)
unregister_kprobe_event(tk);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
end:
mutex_unlock(&event_mutex);
@@ -688,7 +692,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
trace_probe_name(&tk->tp),
- mod->name, ret);
+ module_name(mod), ret);
}
}
mutex_unlock(&event_mutex);
@@ -701,22 +705,52 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
-/* Convert certain expected symbols into '_' when generating event names */
-static inline void sanitize_event_name(char *name)
+static int count_symbols(void *data, unsigned long unused)
{
- while (*name++ != '\0')
- if (*name == ':' || *name == '.')
- *name = '_';
+ unsigned int *count = data;
+
+ (*count)++;
+
+ return 0;
}
-static int trace_kprobe_create(int argc, const char *argv[])
+struct sym_count_ctx {
+ unsigned int count;
+ const char *name;
+};
+
+static int count_mod_symbols(void *data, const char *name, unsigned long unused)
+{
+ struct sym_count_ctx *ctx = data;
+
+ if (strcmp(name, ctx->name) == 0)
+ ctx->count++;
+
+ return 0;
+}
+
+static unsigned int number_of_same_symbols(char *func_name)
+{
+ struct sym_count_ctx ctx = { .count = 0, .name = func_name };
+
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+
+ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+
+ return ctx.count;
+}
+
+static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
* - Add kprobe:
- * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
+ * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
* - Add kretprobe:
- * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
+ * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS]
+ * Or
+ * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS]
+ *
* Fetch args:
* $retval : fetch return value
* $stack : fetch stack address
@@ -733,20 +767,23 @@ static int trace_kprobe_create(int argc, const char *argv[])
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_kprobe *tk = NULL;
- int i, len, ret = 0;
+ int i, len, new_argc = 0, ret = 0;
bool is_return = false;
char *symbol = NULL, *tmp = NULL;
+ const char **new_argv = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ enum probe_print_type ptype;
int maxactive = 0;
long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
- unsigned int flags = TPARG_FL_KERNEL;
+ char gbuf[MAX_EVENT_NAME_LEN];
+ char abuf[MAX_BTF_ARGS_LEN];
+ struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
switch (argv[0][0]) {
case 'r':
is_return = true;
- flags |= TPARG_FL_RETURN;
break;
case 'p':
break;
@@ -764,7 +801,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
if (isdigit(argv[0][1])) {
if (!is_return) {
- trace_probe_log_err(1, MAXACT_NO_KPROBE);
+ trace_probe_log_err(1, BAD_MAXACT_TYPE);
goto parse_error;
}
if (event)
@@ -804,27 +841,70 @@ static int trace_kprobe_create(int argc, const char *argv[])
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
return -ENOMEM;
+
+ tmp = strchr(symbol, '%');
+ if (tmp) {
+ if (!strcmp(tmp, "%return")) {
+ *tmp = '\0';
+ is_return = true;
+ } else {
+ trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
+ goto parse_error;
+ }
+ }
+
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
trace_probe_log_err(0, BAD_PROBE_ADDR);
goto parse_error;
}
- if (kprobe_on_func_entry(NULL, symbol, offset))
- flags |= TPARG_FL_FENTRY;
- if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
+ if (is_return)
+ ctx.flags |= TPARG_FL_RETURN;
+ ret = kprobe_on_func_entry(NULL, symbol, offset);
+ if (ret == 0 && !is_return)
+ ctx.flags |= TPARG_FL_FENTRY;
+ /* Defer the ENOENT case until register kprobe */
+ if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
goto parse_error;
}
}
+ if (symbol && !strchr(symbol, ':')) {
+ unsigned int count;
+
+ count = number_of_same_symbols(symbol);
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ ret = -EADDRNOTAVAIL;
+
+ goto error;
+ } else if (count == 0) {
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ ret = -ENOENT;
+
+ goto error;
+ }
+ }
+
trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf,
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
goto parse_error;
- } else {
+ }
+
+ if (!event) {
/* Make a new event name */
if (symbol)
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
@@ -836,33 +916,41 @@ static int trace_kprobe_create(int argc, const char *argv[])
event = buf;
}
+ argc -= 2; argv += 2;
+ ctx.funcname = symbol;
+ new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
+ abuf, MAX_BTF_ARGS_LEN, &ctx);
+ if (IS_ERR(new_argv)) {
+ ret = PTR_ERR(new_argv);
+ new_argv = NULL;
+ goto out;
+ }
+ if (new_argv) {
+ argc = new_argc;
+ argv = new_argv;
+ }
+
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
- argc - 2, is_return);
+ argc, is_return);
if (IS_ERR(tk)) {
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
goto out; /* We know tk is not allocated */
}
- argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
- kfree(tmp);
+ ctx.offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
if (ret)
goto error; /* This can be -ENOMEM */
}
- ret = traceprobe_set_print_fmt(&tk->tp, is_return);
+ ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
goto error;
@@ -879,7 +967,9 @@ static int trace_kprobe_create(int argc, const char *argv[])
}
out:
+ traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
+ kfree(new_argv);
kfree(symbol);
return ret;
@@ -890,20 +980,25 @@ error:
goto out;
}
-static int create_or_delete_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_kprobe_create);
+}
+
+static int create_or_delete_trace_kprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_kprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(argc, (const char **)argv);
+ ret = trace_kprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
static int trace_kprobe_run_command(struct dynevent_cmd *cmd)
{
- return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(cmd->seq.buffer);
}
/**
@@ -925,10 +1020,10 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
/**
* __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list
* @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @kretprobe: Is this a return probe?
* @name: The name of the kprobe event
* @loc: The location of the kprobe event
- * @kretprobe: Is this a return probe?
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the kprobe_event_gen_cmd_start() wrapper, which automatically
@@ -1001,7 +1096,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start);
/**
* __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list
* @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the kprobe_event_add_fields() wrapper, which
@@ -1064,7 +1159,7 @@ int kprobe_event_delete(const char *name)
snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name);
- return trace_run_command(buf, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(buf);
}
EXPORT_SYMBOL_GPL(kprobe_event_delete);
@@ -1154,20 +1249,28 @@ static const struct file_operations kprobe_events_ops = {
.write = probes_write,
};
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+ return trace_kprobe_is_return(tk) ?
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_kprobe *tk;
+ unsigned long nmissed;
if (!is_trace_kprobe(ev))
return 0;
tk = to_trace_kprobe(ev);
+ nmissed = trace_kprobe_missed(tk);
seq_printf(m, " %-44s %15lu %15lu\n",
trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
- tk->rp.kp.nmissed);
+ nmissed);
return 0;
}
@@ -1198,117 +1301,14 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
-/* Kprobe specific fetch functions */
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
-}
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen(unsigned long addr)
-{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
-}
-
-/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
-{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
-}
-
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
-{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
-}
-
-static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
-{
- const void __user *uaddr = (__force const void __user *)src;
-
- return copy_from_user_nofault(dest, uaddr, size);
-}
-
-static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)src < TASK_SIZE)
- return probe_mem_read_user(dest, src, size);
-#endif
- return copy_from_kernel_nofault(dest, src, size);
-}
-
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
retry:
/* 1st stage: get value from context */
@@ -1325,15 +1325,6 @@ retry:
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
@@ -1343,7 +1334,9 @@ retry:
code++;
goto retry;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -1366,22 +1359,14 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
- fbuffer.trace_file = trace_file;
-
dsize = __get_data_size(&tk->tp, regs);
- fbuffer.event =
- trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
- call->event.type,
- sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
- if (!fbuffer.event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tk->tp.size + dsize);
+ if (!entry)
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
@@ -1414,23 +1399,16 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
- fbuffer.trace_file = trace_file;
-
dsize = __get_data_size(&tk->tp, regs);
- fbuffer.event =
- trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
- call->event.type,
- sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
- if (!fbuffer.event)
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tk->tp.size + dsize);
+ if (!entry)
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
@@ -1469,7 +1447,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -1504,7 +1482,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -1625,7 +1603,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
@@ -1634,7 +1612,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
const char **symbol, u64 *probe_offset,
- u64 *probe_addr, bool perf_type_tracepoint)
+ u64 *probe_addr, unsigned long *missed,
+ bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1649,15 +1628,12 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
: BPF_FD_TYPE_KPROBE;
- if (tk->symbol) {
- *symbol = tk->symbol;
- *probe_offset = tk->rp.kp.offset;
- *probe_addr = 0;
- } else {
- *symbol = NULL;
- *probe_offset = 0;
- *probe_addr = (unsigned long)tk->rp.kp.addr;
- }
+ *probe_offset = tk->rp.kp.offset;
+ *probe_addr = kallsyms_show_value(current_cred()) ?
+ (unsigned long)tk->rp.kp.addr : 0;
+ *symbol = tk->symbol;
+ if (missed)
+ *missed = trace_kprobe_missed(tk);
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -1714,8 +1690,18 @@ NOKPROBE_SYMBOL(kprobe_dispatcher);
static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
- struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
+ struct kretprobe *rp = get_kretprobe(ri);
+ struct trace_kprobe *tk;
+
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregister on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return 0;
+ tk = container_of(rp, struct trace_kprobe, rp);
raw_cpu_inc(*tk->nhit);
if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
@@ -1724,7 +1710,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
kretprobe_perf_func(tk, ri, regs);
#endif
- return 0; /* We don't tweek kernel, so just return 0 */
+ return 0; /* We don't tweak kernel, so just return 0 */
}
NOKPROBE_SYMBOL(kretprobe_dispatcher);
@@ -1777,15 +1763,35 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
}
#ifdef CONFIG_PERF_EVENTS
+
/* create a trace_kprobe, but don't add it to global lists */
struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
+ enum probe_print_type ptype;
struct trace_kprobe *tk;
int ret;
char *event;
+ if (func) {
+ unsigned int count;
+
+ count = number_of_same_symbols(func);
+ if (count > 1)
+ /*
+ * Users should use addr to remove the ambiguity of
+ * using func only.
+ */
+ return ERR_PTR(-EADDRNOTAVAIL);
+ else if (count == 0)
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ return ERR_PTR(-ENOENT);
+ }
+
/*
* local trace_kprobes are not added to dyn_event, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
@@ -1805,7 +1811,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
init_trace_event_call(tk);
- if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ ptype = trace_kprobe_is_return(tk) ?
+ PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1867,11 +1875,9 @@ static __init void setup_boot_kprobe_events(void)
if (p)
*p++ = '\0';
- ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe(cmd);
if (ret)
pr_warn("Failed to add event(%d): %s\n", ret, cmd);
- else
- kprobe_boot_events_enabled = true;
cmd = p;
}
@@ -1880,8 +1886,8 @@ static __init void setup_boot_kprobe_events(void)
}
/*
- * Register dynevent at subsys_initcall. This allows kernel to setup kprobe
- * events in fs_initcall without tracefs.
+ * Register dynevent at core_initcall. This allows kernel to setup kprobe
+ * events in postcore_initcall without tracefs.
*/
static __init int init_kprobe_trace_early(void)
{
@@ -1896,31 +1902,24 @@ static __init int init_kprobe_trace_early(void)
return 0;
}
-subsys_initcall(init_kprobe_trace_early);
+core_initcall(init_kprobe_trace_early);
/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
- struct dentry *d_tracer;
- struct dentry *entry;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
- NULL, &kprobe_events_ops);
-
/* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_events' entry\n");
+ trace_create_file("kprobe_events", TRACE_MODE_WRITE,
+ NULL, NULL, &kprobe_events_ops);
/* Profile interface */
- entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
- NULL, &kprobe_profile_ops);
-
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
+ trace_create_file("kprobe_profile", TRACE_MODE_READ,
+ NULL, NULL, &kprobe_profile_ops);
setup_boot_kprobe_events();
@@ -1956,17 +1955,14 @@ static __init int kprobe_trace_self_tests_init(void)
if (tracing_is_disabled())
return -ENODEV;
- if (kprobe_boot_events_enabled) {
- pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n");
+ if (tracing_selftest_disabled)
return 0;
- }
target = kprobe_trace_selftest_target;
pr_info("Testing kprobe tracing: ");
- ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1987,8 +1983,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -2061,13 +2056,13 @@ static __init int kprobe_trace_self_tests_init(void)
trace_probe_event_call(&tk->tp), file);
}
- ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe2");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c
index 16548ee4c8c6..3851cd1e6a62 100644
--- a/kernel/trace/trace_kprobe_selftest.c
+++ b/kernel/trace/trace_kprobe_selftest.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+
+#include "trace_kprobe_selftest.h"
+
/*
* Function used during the kprobe self test. This function is in a separate
* compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 84582bf1ed5f..64e77b513697 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -5,8 +5,6 @@
* Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
*/
-#define DEBUG 1
-
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
@@ -300,10 +298,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -312,7 +311,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -330,10 +329,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -342,7 +342,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
new file mode 100644
index 000000000000..a8e28f9b9271
--- /dev/null
+++ b/kernel/trace/trace_osnoise.c
@@ -0,0 +1,3133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OS Noise Tracer: computes the OS Noise suffered by a running thread.
+ * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread.
+ *
+ * Based on "hwlat_detector" tracer by:
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ * With feedback from Clark Williams <williams@redhat.com>
+ *
+ * And also based on the rtsl tracer presented on:
+ * DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux
+ * scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems
+ * (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020.
+ *
+ * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. <bristot@redhat.com>
+ */
+
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched.h>
+#include "trace.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/osnoise.h>
+
+/*
+ * Default values.
+ */
+#define BANNER "osnoise: "
+#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */
+#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */
+
+#define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */
+#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
+
+/*
+ * osnoise/options entries.
+ */
+enum osnoise_options_index {
+ OSN_DEFAULTS = 0,
+ OSN_WORKLOAD,
+ OSN_PANIC_ON_STOP,
+ OSN_PREEMPT_DISABLE,
+ OSN_IRQ_DISABLE,
+ OSN_MAX
+};
+
+static const char * const osnoise_options_str[OSN_MAX] = {
+ "DEFAULTS",
+ "OSNOISE_WORKLOAD",
+ "PANIC_ON_STOP",
+ "OSNOISE_PREEMPT_DISABLE",
+ "OSNOISE_IRQ_DISABLE" };
+
+#define OSN_DEFAULT_OPTIONS 0x2
+static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
+
+/*
+ * trace_array of the enabled osnoise/timerlat instances.
+ */
+struct osnoise_instance {
+ struct list_head list;
+ struct trace_array *tr;
+};
+
+static struct list_head osnoise_instances;
+
+static bool osnoise_has_registered_instances(void)
+{
+ return !!list_first_or_null_rcu(&osnoise_instances,
+ struct osnoise_instance,
+ list);
+}
+
+/*
+ * osnoise_instance_registered - check if a tr is already registered
+ */
+static int osnoise_instance_registered(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ if (inst->tr == tr)
+ found = 1;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+/*
+ * osnoise_register_instance - register a new trace instance
+ *
+ * Register a trace_array *tr in the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static int osnoise_register_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ lockdep_assert_held(&trace_types_lock);
+
+ inst = kmalloc(sizeof(*inst), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD_RCU(&inst->list);
+ inst->tr = tr;
+ list_add_tail_rcu(&inst->list, &osnoise_instances);
+
+ return 0;
+}
+
+/*
+ * osnoise_unregister_instance - unregister a registered trace instance
+ *
+ * Remove the trace_array *tr from the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static void osnoise_unregister_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ list_for_each_entry_rcu(inst, &osnoise_instances, list,
+ lockdep_is_held(&trace_types_lock)) {
+ if (inst->tr == tr) {
+ list_del_rcu(&inst->list);
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return;
+
+ kvfree_rcu_mightsleep(inst);
+}
+
+/*
+ * NMI runtime info.
+ */
+struct osn_nmi {
+ u64 count;
+ u64 delta_start;
+};
+
+/*
+ * IRQ runtime info.
+ */
+struct osn_irq {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+#define IRQ_CONTEXT 0
+#define THREAD_CONTEXT 1
+#define THREAD_URET 2
+/*
+ * sofirq runtime info.
+ */
+struct osn_softirq {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+/*
+ * thread runtime info.
+ */
+struct osn_thread {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+/*
+ * Runtime information: this structure saves the runtime information used by
+ * one sampling thread.
+ */
+struct osnoise_variables {
+ struct task_struct *kthread;
+ bool sampling;
+ pid_t pid;
+ struct osn_nmi nmi;
+ struct osn_irq irq;
+ struct osn_softirq softirq;
+ struct osn_thread thread;
+ local_t int_counter;
+};
+
+/*
+ * Per-cpu runtime information.
+ */
+static DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
+
+/*
+ * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU
+ */
+static inline struct osnoise_variables *this_cpu_osn_var(void)
+{
+ return this_cpu_ptr(&per_cpu_osnoise_var);
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * Runtime information for the timer mode.
+ */
+struct timerlat_variables {
+ struct task_struct *kthread;
+ struct hrtimer timer;
+ u64 rel_period;
+ u64 abs_period;
+ bool tracing_thread;
+ u64 count;
+ bool uthread_migrate;
+};
+
+static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
+
+/*
+ * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
+ */
+static inline struct timerlat_variables *this_cpu_tmr_var(void)
+{
+ return this_cpu_ptr(&per_cpu_timerlat_var);
+}
+
+/*
+ * tlat_var_reset - Reset the values of the given timerlat_variables
+ */
+static inline void tlat_var_reset(void)
+{
+ struct timerlat_variables *tlat_var;
+ int cpu;
+ /*
+ * So far, all the values are initialized as 0, so
+ * zeroing the structure is perfect.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ memset(tlat_var, 0, sizeof(*tlat_var));
+ }
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+#define tlat_var_reset() do {} while (0)
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * osn_var_reset - Reset the values of the given osnoise_variables
+ */
+static inline void osn_var_reset(void)
+{
+ struct osnoise_variables *osn_var;
+ int cpu;
+
+ /*
+ * So far, all the values are initialized as 0, so
+ * zeroing the structure is perfect.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ memset(osn_var, 0, sizeof(*osn_var));
+ }
+}
+
+/*
+ * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables
+ */
+static inline void osn_var_reset_all(void)
+{
+ osn_var_reset();
+ tlat_var_reset();
+}
+
+/*
+ * Tells NMIs to call back to the osnoise tracer to record timestamps.
+ */
+bool trace_osnoise_callback_enabled;
+
+/*
+ * osnoise sample structure definition. Used to store the statistics of a
+ * sample run.
+ */
+struct osnoise_sample {
+ u64 runtime; /* runtime */
+ u64 noise; /* noise */
+ u64 max_sample; /* max single noise sample */
+ int hw_count; /* # HW (incl. hypervisor) interference */
+ int nmi_count; /* # NMIs during this sample */
+ int irq_count; /* # IRQs during this sample */
+ int softirq_count; /* # softirqs during this sample */
+ int thread_count; /* # threads during this sample */
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat sample structure definition. Used to store the statistics of
+ * a sample run.
+ */
+struct timerlat_sample {
+ u64 timer_latency; /* timer_latency */
+ unsigned int seqnum; /* unique sequence */
+ int context; /* timer context */
+};
+#endif
+
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
+/*
+ * Tracer data.
+ */
+static struct osnoise_data {
+ u64 sample_period; /* total sampling period */
+ u64 sample_runtime; /* active sampling portion of period */
+ u64 stop_tracing; /* stop trace in the internal operation (loop/irq) */
+ u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */
+#ifdef CONFIG_TIMERLAT_TRACER
+ u64 timerlat_period; /* timerlat period */
+ u64 print_stack; /* print IRQ stack if total > */
+ int timerlat_tracer; /* timerlat tracer */
+#endif
+ bool tainted; /* infor users and developers about a problem */
+} osnoise_data = {
+ .sample_period = DEFAULT_SAMPLE_PERIOD,
+ .sample_runtime = DEFAULT_SAMPLE_RUNTIME,
+ .stop_tracing = 0,
+ .stop_tracing_total = 0,
+#ifdef CONFIG_TIMERLAT_TRACER
+ .print_stack = 0,
+ .timerlat_period = DEFAULT_TIMERLAT_PERIOD,
+ .timerlat_tracer = 0,
+#endif
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+static inline bool timerlat_enabled(void)
+{
+ return osnoise_data.timerlat_tracer;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->softirq.arrival_time = 0;
+ osn_var->softirq.delta_start = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->thread.delta_start = 0;
+ osn_var->thread.arrival_time = 0;
+ return 0;
+ }
+ return 1;
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static inline bool timerlat_enabled(void)
+{
+ return false;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Print the osnoise header info.
+ */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+
+ seq_puts(s, "# |||||| / ");
+ seq_puts(s, " MAX\n");
+
+ seq_puts(s, "# ||||| / ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# ||||||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
+ seq_puts(s, "# _-----=> irqs-off\n");
+ seq_puts(s, "# / _----=> need-resched\n");
+ seq_puts(s, "# | / _---=> hardirq/softirq\n");
+ seq_puts(s, "# || / _--=> preempt-depth\n");
+ seq_puts(s, "# ||| / _-=> migrate-disable ");
+ seq_puts(s, " MAX\n");
+ seq_puts(s, "# |||| / delay ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# ||||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | ||||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
+ * osnoise_taint - report an osnoise error.
+ */
+#define osnoise_taint(msg) ({ \
+ struct osnoise_instance *inst; \
+ struct trace_buffer *buffer; \
+ \
+ rcu_read_lock(); \
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
+ buffer = inst->tr->array_buffer.buffer; \
+ trace_array_printk_buf(buffer, _THIS_IP_, msg); \
+ } \
+ rcu_read_unlock(); \
+ osnoise_data.tainted = true; \
+})
+
+/*
+ * Record an osnoise_sample into the tracer buffer.
+ */
+static void
+__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
+{
+ struct trace_event_call *call = &event_osnoise;
+ struct ring_buffer_event *event;
+ struct osnoise_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry),
+ tracing_gen_ctx());
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->runtime = sample->runtime;
+ entry->noise = sample->noise;
+ entry->max_sample = sample->max_sample;
+ entry->hw_count = sample->hw_count;
+ entry->nmi_count = sample->nmi_count;
+ entry->irq_count = sample->irq_count;
+ entry->softirq_count = sample->softirq_count;
+ entry->thread_count = sample->thread_count;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+/*
+ * Record an osnoise_sample on all osnoise instances.
+ */
+static void trace_osnoise_sample(struct osnoise_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_osnoise_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * Print the timerlat header info.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||||| /\n");
+ seq_puts(s, "# ||||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-----=> irqs-off\n");
+ seq_puts(s, "# / _----=> need-resched\n");
+ seq_puts(s, "# | / _---=> hardirq/softirq\n");
+ seq_puts(s, "# || / _--=> preempt-depth\n");
+ seq_puts(s, "# ||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||| / delay\n");
+ seq_puts(s, "# ||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||| | | ");
+ seq_puts(s, " | |\n");
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+static void
+__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
+{
+ struct trace_event_call *call = &event_osnoise;
+ struct ring_buffer_event *event;
+ struct timerlat_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry),
+ tracing_gen_ctx());
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->seqnum = sample->seqnum;
+ entry->context = sample->context;
+ entry->timer_latency = sample->timer_latency;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+/*
+ * Record an timerlat_sample into the tracer buffer.
+ */
+static void trace_timerlat_sample(struct timerlat_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_timerlat_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_CALLS 256
+
+/*
+ * Stack trace will take place only at IRQ level, so, no need
+ * to control nesting here.
+ */
+struct trace_stack {
+ int stack_size;
+ int nr_entries;
+ unsigned long calls[MAX_CALLS];
+};
+
+static DEFINE_PER_CPU(struct trace_stack, trace_stack);
+
+/*
+ * timerlat_save_stack - save a stack trace without printing
+ *
+ * Save the current stack trace without printing. The
+ * stack will be printed later, after the end of the measurement.
+ */
+static void timerlat_save_stack(int skip)
+{
+ unsigned int size, nr_entries;
+ struct trace_stack *fstack;
+
+ fstack = this_cpu_ptr(&trace_stack);
+
+ size = ARRAY_SIZE(fstack->calls);
+
+ nr_entries = stack_trace_save(fstack->calls, size, skip);
+
+ fstack->stack_size = nr_entries * sizeof(unsigned long);
+ fstack->nr_entries = nr_entries;
+
+ return;
+
+}
+
+static void
+__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
+{
+ struct trace_event_call *call = &event_osnoise;
+ struct ring_buffer_event *event;
+ struct stack_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size,
+ tracing_gen_ctx());
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+
+ memcpy(&entry->caller, fstack->calls, size);
+ entry->size = fstack->nr_entries;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+/*
+ * timerlat_dump_stack - dump a stack trace previously saved
+ */
+static void timerlat_dump_stack(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+ struct trace_stack *fstack;
+ unsigned int size;
+
+ /*
+ * trace only if latency > print_stack config, if enabled.
+ */
+ if (!osnoise_data.print_stack || osnoise_data.print_stack > latency)
+ return;
+
+ preempt_disable_notrace();
+ fstack = this_cpu_ptr(&trace_stack);
+ size = fstack->stack_size;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __timerlat_dump_stack(buffer, fstack, size);
+
+ }
+ rcu_read_unlock();
+ preempt_enable_notrace();
+}
+#else /* CONFIG_STACKTRACE */
+#define timerlat_dump_stack(u64 latency) do {} while (0)
+#define timerlat_save_stack(a) do {} while (0)
+#endif /* CONFIG_STACKTRACE */
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * Macros to encapsulate the time capturing infrastructure.
+ */
+#define time_get() trace_clock_local()
+#define time_to_us(x) div_u64(x, 1000)
+#define time_sub(a, b) ((a) - (b))
+
+/*
+ * cond_move_irq_delta_start - Forward the delta_start of a running IRQ
+ *
+ * If an IRQ is preempted by an NMI, its delta_start is pushed forward
+ * to discount the NMI interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->irq.delta_start)
+ osn_var->irq.delta_start += duration;
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * cond_move_softirq_delta_start - Forward the delta_start of a running softirq.
+ *
+ * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed
+ * forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->softirq.delta_start)
+ osn_var->softirq.delta_start += duration;
+}
+#else /* CONFIG_PREEMPT_RT */
+#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0)
+#endif
+
+/*
+ * cond_move_thread_delta_start - Forward the delta_start of a running thread
+ *
+ * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start
+ * is pushed forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->thread.delta_start)
+ osn_var->thread.delta_start += duration;
+}
+
+/*
+ * get_int_safe_duration - Get the duration of a window
+ *
+ * The irq, softirq and thread varaibles need to have its duration without
+ * the interference from higher priority interrupts. Instead of keeping a
+ * variable to discount the interrupt interference from these variables, the
+ * starting time of these variables are pushed forward with the interrupt's
+ * duration. In this way, a single variable is used to:
+ *
+ * - Know if a given window is being measured.
+ * - Account its duration.
+ * - Discount the interference.
+ *
+ * To avoid getting inconsistent values, e.g.,:
+ *
+ * now = time_get()
+ * ---> interrupt!
+ * delta_start -= int duration;
+ * <---
+ * duration = now - delta_start;
+ *
+ * result: negative duration if the variable duration before the
+ * interrupt was smaller than the interrupt execution.
+ *
+ * A counter of interrupts is used. If the counter increased, try
+ * to capture an interference safe duration.
+ */
+static inline s64
+get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start)
+{
+ u64 int_counter, now;
+ s64 duration;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ now = time_get();
+ duration = (now - *delta_start);
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ /*
+ * This is an evidence of race conditions that cause
+ * a value to be "discounted" too much.
+ */
+ if (duration < 0)
+ osnoise_taint("Negative duration!\n");
+
+ *delta_start = 0;
+
+ return duration;
+}
+
+/*
+ *
+ * set_int_safe_time - Save the current time on *time, aware of interference
+ *
+ * Get the time, taking into consideration a possible interference from
+ * higher priority interrupts.
+ *
+ * See get_int_safe_duration() for an explanation.
+ */
+static u64
+set_int_safe_time(struct osnoise_variables *osn_var, u64 *time)
+{
+ u64 int_counter;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ *time = time_get();
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ return int_counter;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * copy_int_safe_time - Copy *src into *desc aware of interference
+ */
+static u64
+copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src)
+{
+ u64 int_counter;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ *dst = *src;
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ return int_counter;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * trace_osnoise_callback - NMI entry/exit callback
+ *
+ * This function is called at the entry and exit NMI code. The bool enter
+ * distinguishes between either case. This function is used to note a NMI
+ * occurrence, compute the noise caused by the NMI, and to remove the noise
+ * it is potentially causing on other interference variables.
+ */
+void trace_osnoise_callback(bool enter)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ u64 duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ /*
+ * Currently trace_clock_local() calls sched_clock() and the
+ * generic version is not NMI safe.
+ */
+ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+ if (enter) {
+ osn_var->nmi.delta_start = time_get();
+ local_inc(&osn_var->int_counter);
+ } else {
+ duration = time_get() - osn_var->nmi.delta_start;
+
+ trace_nmi_noise(osn_var->nmi.delta_start, duration);
+
+ cond_move_irq_delta_start(osn_var, duration);
+ cond_move_softirq_delta_start(osn_var, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+ }
+ }
+
+ if (enter)
+ osn_var->nmi.count++;
+}
+
+/*
+ * osnoise_trace_irq_entry - Note the starting of an IRQ
+ *
+ * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs,
+ * it is safe to use a single variable (ons_var->irq) to save the statistics.
+ * The arrival_time is used to report... the arrival time. The delta_start
+ * is used to compute the duration at the IRQ exit handler. See
+ * cond_move_irq_delta_start().
+ */
+void osnoise_trace_irq_entry(int id)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+ if (!osn_var->sampling)
+ return;
+ /*
+ * This value will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->irq.arrival_time = time_get();
+ set_int_safe_time(osn_var, &osn_var->irq.delta_start);
+ osn_var->irq.count++;
+
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * osnoise_irq_exit - Note the end of an IRQ, sava data and trace
+ *
+ * Computes the duration of the IRQ noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+void osnoise_trace_irq_exit(int id, const char *desc)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ s64 duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start);
+ trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration);
+ osn_var->irq.arrival_time = 0;
+ cond_move_softirq_delta_start(osn_var, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+}
+
+/*
+ * trace_irqentry_callback - Callback to the irq:irq_entry traceevent
+ *
+ * Used to note the starting of an IRQ occurece.
+ */
+static void trace_irqentry_callback(void *data, int irq,
+ struct irqaction *action)
+{
+ osnoise_trace_irq_entry(irq);
+}
+
+/*
+ * trace_irqexit_callback - Callback to the irq:irq_exit traceevent
+ *
+ * Used to note the end of an IRQ occurece.
+ */
+static void trace_irqexit_callback(void *data, int irq,
+ struct irqaction *action, int ret)
+{
+ osnoise_trace_irq_exit(irq, action->name);
+}
+
+/*
+ * arch specific register function.
+ */
+int __weak osnoise_arch_register(void)
+{
+ return 0;
+}
+
+/*
+ * arch specific unregister function.
+ */
+void __weak osnoise_arch_unregister(void)
+{
+ return;
+}
+
+/*
+ * hook_irq_events - Hook IRQ handling events
+ *
+ * This function hooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+static int hook_irq_events(void)
+{
+ int ret;
+
+ ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+ if (ret)
+ goto out_unregister_entry;
+
+ ret = osnoise_arch_register();
+ if (ret)
+ goto out_irq_exit;
+
+ return 0;
+
+out_irq_exit:
+ unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+out_unregister_entry:
+ unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+out_err:
+ return -EINVAL;
+}
+
+/*
+ * unhook_irq_events - Unhook IRQ handling events
+ *
+ * This function unhooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+static void unhook_irq_events(void)
+{
+ osnoise_arch_unregister();
+ unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+ unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * trace_softirq_entry_callback - Note the starting of a softirq
+ *
+ * Save the starting time of a softirq. As softirqs are non-preemptive to
+ * other softirqs, it is safe to use a single variable (ons_var->softirq)
+ * to save the statistics. The arrival_time is used to report... the
+ * arrival time. The delta_start is used to compute the duration at the
+ * softirq exit handler. See cond_move_softirq_delta_start().
+ */
+static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+ if (!osn_var->sampling)
+ return;
+ /*
+ * This value will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->softirq.arrival_time = time_get();
+ set_int_safe_time(osn_var, &osn_var->softirq.delta_start);
+ osn_var->softirq.count++;
+
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * trace_softirq_exit_callback - Note the end of an softirq
+ *
+ * Computes the duration of the softirq noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ s64 duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_softirq_exit(osn_var))
+ return;
+
+ duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
+ trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+ osn_var->softirq.arrival_time = 0;
+}
+
+/*
+ * hook_softirq_events - Hook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static int hook_softirq_events(void)
+{
+ int ret;
+
+ ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+ if (ret)
+ goto out_unreg_entry;
+
+ return 0;
+
+out_unreg_entry:
+ unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+out_err:
+ return -EINVAL;
+}
+
+/*
+ * unhook_softirq_events - Unhook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static void unhook_softirq_events(void)
+{
+ unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+ unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+}
+#else /* CONFIG_PREEMPT_RT */
+/*
+ * softirq are threads on the PREEMPT_RT mode.
+ */
+static int hook_softirq_events(void)
+{
+ return 0;
+}
+static void unhook_softirq_events(void)
+{
+}
+#endif
+
+/*
+ * thread_entry - Record the starting of a thread noise window
+ *
+ * It saves the context switch time for a noisy thread, and increments
+ * the interference counters.
+ */
+static void
+thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+ if (!osn_var->sampling)
+ return;
+ /*
+ * The arrival time will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->thread.arrival_time = time_get();
+
+ set_int_safe_time(osn_var, &osn_var->thread.delta_start);
+
+ osn_var->thread.count++;
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * thread_exit - Report the end of a thread noise window
+ *
+ * It computes the total noise from a thread, tracing if needed.
+ */
+static void
+thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+ s64 duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_thread_exit(osn_var))
+ return;
+
+ duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
+
+ trace_thread_noise(t, osn_var->thread.arrival_time, duration);
+
+ osn_var->thread.arrival_time = 0;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * osnoise_stop_exception - Stop tracing and the tracer.
+ */
+static __always_inline void osnoise_stop_exception(char *msg, int cpu)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
+
+ if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
+ panic("tracer hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
+
+ tracer_tracing_off(tr);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler
+ *
+ * his function is hooked to the sched:sched_migrate_task trace event, and monitors
+ * timerlat user-space thread migration.
+ */
+static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu)
+{
+ struct osnoise_variables *osn_var;
+ long cpu = task_cpu(p);
+
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ if (osn_var->pid == p->pid && dest_cpu != cpu) {
+ per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
+ osnoise_taint("timerlat user-thread migrated\n");
+ osnoise_stop_exception("timerlat user-thread migrated", cpu);
+ }
+}
+
+static int register_migration_monitor(void)
+{
+ int ret = 0;
+
+ /*
+ * Timerlat thread migration check is only required when running timerlat in user-space.
+ * Thus, enable callback only if timerlat is set with no workload.
+ */
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+
+ return ret;
+}
+
+static void unregister_migration_monitor(void)
+{
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+}
+#else
+static int register_migration_monitor(void)
+{
+ return 0;
+}
+static void unregister_migration_monitor(void) {}
+#endif
+/*
+ * trace_sched_switch - sched:sched_switch trace event handler
+ *
+ * This function is hooked to the sched:sched_switch trace event, and it is
+ * used to record the beginning and to report the end of a thread noise window.
+ */
+static void
+trace_sched_switch_callback(void *data, bool preempt,
+ struct task_struct *p,
+ struct task_struct *n,
+ unsigned int prev_state)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ int workload = test_bit(OSN_WORKLOAD, &osnoise_options);
+
+ if ((p->pid != osn_var->pid) || !workload)
+ thread_exit(osn_var, p);
+
+ if ((n->pid != osn_var->pid) || !workload)
+ thread_entry(osn_var, n);
+}
+
+/*
+ * hook_thread_events - Hook the instrumentation for thread noise
+ *
+ * Hook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+static int hook_thread_events(void)
+{
+ int ret;
+
+ ret = register_trace_sched_switch(trace_sched_switch_callback, NULL);
+ if (ret)
+ return -EINVAL;
+
+ ret = register_migration_monitor();
+ if (ret)
+ goto out_unreg;
+
+ return 0;
+
+out_unreg:
+ unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+ return -EINVAL;
+}
+
+/*
+ * unhook_thread_events - unhook the instrumentation for thread noise
+ *
+ * Unook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+static void unhook_thread_events(void)
+{
+ unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+ unregister_migration_monitor();
+}
+
+/*
+ * save_osn_sample_stats - Save the osnoise_sample statistics
+ *
+ * Save the osnoise_sample statistics before the sampling phase. These
+ * values will be used later to compute the diff betwneen the statistics
+ * before and after the osnoise sampling.
+ */
+static void
+save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+ s->nmi_count = osn_var->nmi.count;
+ s->irq_count = osn_var->irq.count;
+ s->softirq_count = osn_var->softirq.count;
+ s->thread_count = osn_var->thread.count;
+}
+
+/*
+ * diff_osn_sample_stats - Compute the osnoise_sample statistics
+ *
+ * After a sample period, compute the difference on the osnoise_sample
+ * statistics. The struct osnoise_sample *s contains the statistics saved via
+ * save_osn_sample_stats() before the osnoise sampling.
+ */
+static void
+diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+ s->nmi_count = osn_var->nmi.count - s->nmi_count;
+ s->irq_count = osn_var->irq.count - s->irq_count;
+ s->softirq_count = osn_var->softirq.count - s->softirq_count;
+ s->thread_count = osn_var->thread.count - s->thread_count;
+}
+
+/*
+ * osnoise_stop_tracing - Stop tracing and the tracer.
+ */
+static __always_inline void osnoise_stop_tracing(void)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
+
+ if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
+ panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
+
+ tracer_tracing_off(tr);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * osnoise_has_tracing_on - Check if there is at least one instance on
+ */
+static __always_inline int osnoise_has_tracing_on(void)
+{
+ struct osnoise_instance *inst;
+ int trace_is_on = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list)
+ trace_is_on += tracer_tracing_is_on(inst->tr);
+ rcu_read_unlock();
+
+ return trace_is_on;
+}
+
+/*
+ * notify_new_max_latency - Notify a new max latency via fsnotify interface.
+ */
+static void notify_new_max_latency(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
+ tr->max_latency = latency;
+ latency_fsnotify(tr);
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * run_osnoise - Sample the time and look for osnoise
+ *
+ * Used to capture the time, looking for potential osnoise latency repeatedly.
+ * Different from hwlat_detector, it is called with preemption and interrupts
+ * enabled. This allows irqs, softirqs and threads to run, interfering on the
+ * osnoise sampling thread, as they would do with a regular thread.
+ */
+static int run_osnoise(void)
+{
+ bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options);
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ u64 start, sample, last_sample;
+ u64 last_int_count, int_count;
+ s64 noise = 0, max_noise = 0;
+ s64 total, last_total = 0;
+ struct osnoise_sample s;
+ bool disable_preemption;
+ unsigned int threshold;
+ u64 runtime, stop_in;
+ u64 sum_noise = 0;
+ int hw_count = 0;
+ int ret = -1;
+
+ /*
+ * Disabling preemption is only required if IRQs are enabled,
+ * and the options is set on.
+ */
+ disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options);
+
+ /*
+ * Considers the current thread as the workload.
+ */
+ osn_var->pid = current->pid;
+
+ /*
+ * Save the current stats for the diff
+ */
+ save_osn_sample_stats(osn_var, &s);
+
+ /*
+ * if threshold is 0, use the default value of 5 us.
+ */
+ threshold = tracing_thresh ? : 5000;
+
+ /*
+ * Apply PREEMPT and IRQ disabled options.
+ */
+ if (disable_irq)
+ local_irq_disable();
+
+ if (disable_preemption)
+ preempt_disable();
+
+ /*
+ * Make sure NMIs see sampling first
+ */
+ osn_var->sampling = true;
+ barrier();
+
+ /*
+ * Transform the *_us config to nanoseconds to avoid the
+ * division on the main loop.
+ */
+ runtime = osnoise_data.sample_runtime * NSEC_PER_USEC;
+ stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
+
+ /*
+ * Start timestemp
+ */
+ start = time_get();
+
+ /*
+ * "previous" loop.
+ */
+ last_int_count = set_int_safe_time(osn_var, &last_sample);
+
+ do {
+ /*
+ * Get sample!
+ */
+ int_count = set_int_safe_time(osn_var, &sample);
+
+ noise = time_sub(sample, last_sample);
+
+ /*
+ * This shouldn't happen.
+ */
+ if (noise < 0) {
+ osnoise_taint("negative noise!");
+ goto out;
+ }
+
+ /*
+ * Sample runtime.
+ */
+ total = time_sub(sample, start);
+
+ /*
+ * Check for possible overflows.
+ */
+ if (total < last_total) {
+ osnoise_taint("total overflow!");
+ break;
+ }
+
+ last_total = total;
+
+ if (noise >= threshold) {
+ int interference = int_count - last_int_count;
+
+ if (noise > max_noise)
+ max_noise = noise;
+
+ if (!interference)
+ hw_count++;
+
+ sum_noise += noise;
+
+ trace_sample_threshold(last_sample, noise, interference);
+
+ if (osnoise_data.stop_tracing)
+ if (noise > stop_in)
+ osnoise_stop_tracing();
+ }
+
+ /*
+ * In some cases, notably when running on a nohz_full CPU with
+ * a stopped tick PREEMPT_RCU has no way to account for QSs.
+ * This will eventually cause unwarranted noise as PREEMPT_RCU
+ * will force preemption as the means of ending the current
+ * grace period. We avoid this problem by calling
+ * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * EQS allowing PREEMPT_RCU to end the current grace period.
+ * This call shouldn't be wrapped inside an RCU critical
+ * section.
+ *
+ * Note that in non PREEMPT_RCU kernels QSs are handled through
+ * cond_resched()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ if (!disable_irq)
+ local_irq_disable();
+
+ rcu_momentary_dyntick_idle();
+
+ if (!disable_irq)
+ local_irq_enable();
+ }
+
+ /*
+ * For the non-preemptive kernel config: let threads runs, if
+ * they so wish, unless set not do to so.
+ */
+ if (!disable_irq && !disable_preemption)
+ cond_resched();
+
+ last_sample = sample;
+ last_int_count = int_count;
+
+ } while (total < runtime && !kthread_should_stop());
+
+ /*
+ * Finish the above in the view for interrupts.
+ */
+ barrier();
+
+ osn_var->sampling = false;
+
+ /*
+ * Make sure sampling data is no longer updated.
+ */
+ barrier();
+
+ /*
+ * Return to the preemptive state.
+ */
+ if (disable_preemption)
+ preempt_enable();
+
+ if (disable_irq)
+ local_irq_enable();
+
+ /*
+ * Save noise info.
+ */
+ s.noise = time_to_us(sum_noise);
+ s.runtime = time_to_us(total);
+ s.max_sample = time_to_us(max_noise);
+ s.hw_count = hw_count;
+
+ /* Save interference stats info */
+ diff_osn_sample_stats(osn_var, &s);
+
+ trace_osnoise_sample(&s);
+
+ notify_new_max_latency(max_noise);
+
+ if (osnoise_data.stop_tracing_total)
+ if (s.noise > osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+
+ return 0;
+out:
+ return ret;
+}
+
+static struct cpumask osnoise_cpumask;
+static struct cpumask save_cpumask;
+
+/*
+ * osnoise_sleep - sleep until the next period
+ */
+static void osnoise_sleep(bool skip_period)
+{
+ u64 interval;
+ ktime_t wake_time;
+
+ mutex_lock(&interface_lock);
+ if (skip_period)
+ interval = osnoise_data.sample_period;
+ else
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ mutex_unlock(&interface_lock);
+
+ /*
+ * differently from hwlat_detector, the osnoise tracer can run
+ * without a pause because preemption is on.
+ */
+ if (!interval) {
+ /* Let synchronize_rcu_tasks() make progress */
+ cond_resched_tasks_rcu_qs();
+ return;
+ }
+
+ wake_time = ktime_add_us(ktime_get(), interval);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) {
+ if (kthread_should_stop())
+ break;
+ }
+}
+
+/*
+ * osnoise_migration_pending - checks if the task needs to migrate
+ *
+ * osnoise/timerlat threads are per-cpu. If there is a pending request to
+ * migrate the thread away from the current CPU, something bad has happened.
+ * Play the good citizen and leave.
+ *
+ * Returns 0 if it is safe to continue, 1 otherwise.
+ */
+static inline int osnoise_migration_pending(void)
+{
+ if (!current->migration_pending)
+ return 0;
+
+ /*
+ * If migration is pending, there is a task waiting for the
+ * tracer to enable migration. The tracer does not allow migration,
+ * thus: taint and leave to unblock the blocked thread.
+ */
+ osnoise_taint("migration requested to osnoise threads, leaving.");
+
+ /*
+ * Unset this thread from the threads managed by the interface.
+ * The tracers are responsible for cleaning their env before
+ * exiting.
+ */
+ mutex_lock(&interface_lock);
+ this_cpu_osn_var()->kthread = NULL;
+ mutex_unlock(&interface_lock);
+
+ return 1;
+}
+
+/*
+ * osnoise_main - The osnoise detection kernel thread
+ *
+ * Calls run_osnoise() function to measure the osnoise for the configured runtime,
+ * every period.
+ */
+static int osnoise_main(void *data)
+{
+ unsigned long flags;
+
+ /*
+ * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
+ * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
+ *
+ * To work around this limitation, disable migration and remove the
+ * flag.
+ */
+ migrate_disable();
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ current->flags &= ~(PF_NO_SETAFFINITY);
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ while (!kthread_should_stop()) {
+ if (osnoise_migration_pending())
+ break;
+
+ /* skip a period if tracing is off on all instances */
+ if (!osnoise_has_tracing_on()) {
+ osnoise_sleep(true);
+ continue;
+ }
+
+ run_osnoise();
+ osnoise_sleep(false);
+ }
+
+ migrate_enable();
+ return 0;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat_irq - hrtimer handler for timerlat.
+ */
+static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ struct timerlat_variables *tlat;
+ struct timerlat_sample s;
+ u64 now;
+ u64 diff;
+
+ /*
+ * I am not sure if the timer was armed for this CPU. So, get
+ * the timerlat struct from the timer itself, not from this
+ * CPU.
+ */
+ tlat = container_of(timer, struct timerlat_variables, timer);
+
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+
+ /*
+ * Enable the osnoise: events for thread an softirq.
+ */
+ tlat->tracing_thread = true;
+
+ osn_var->thread.arrival_time = time_get();
+
+ /*
+ * A hardirq is running: the timer IRQ. It is for sure preempting
+ * a thread, and potentially preempting a softirq.
+ *
+ * At this point, it is not interesting to know the duration of the
+ * preempted thread (and maybe softirq), but how much time they will
+ * delay the beginning of the execution of the timer thread.
+ *
+ * To get the correct (net) delay added by the softirq, its delta_start
+ * is set as the IRQ one. In this way, at the return of the IRQ, the delta
+ * start of the sofitrq will be zeroed, accounting then only the time
+ * after that.
+ *
+ * The thread follows the same principle. However, if a softirq is
+ * running, the thread needs to receive the softirq delta_start. The
+ * reason being is that the softirq will be the last to be unfolded,
+ * resseting the thread delay to zero.
+ *
+ * The PREEMPT_RT is a special case, though. As softirqs run as threads
+ * on RT, moving the thread is enough.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) {
+ copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+ &osn_var->softirq.delta_start);
+
+ copy_int_safe_time(osn_var, &osn_var->softirq.delta_start,
+ &osn_var->irq.delta_start);
+ } else {
+ copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+ &osn_var->irq.delta_start);
+ }
+
+ /*
+ * Compute the current time with the expected time.
+ */
+ diff = now - tlat->abs_period;
+
+ tlat->count++;
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = IRQ_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+ if (osnoise_data.stop_tracing) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing) {
+
+ /*
+ * At this point, if stop_tracing is set and <= print_stack,
+ * print_stack is set and would be printed in the thread handler.
+ *
+ * Thus, print the stack trace as it is helpful to define the
+ * root cause of an IRQ latency.
+ */
+ if (osnoise_data.stop_tracing <= osnoise_data.print_stack) {
+ timerlat_save_stack(0);
+ timerlat_dump_stack(time_to_us(diff));
+ }
+
+ osnoise_stop_tracing();
+ notify_new_max_latency(diff);
+
+ wake_up_process(tlat->kthread);
+
+ return HRTIMER_NORESTART;
+ }
+ }
+
+ wake_up_process(tlat->kthread);
+
+ if (osnoise_data.print_stack)
+ timerlat_save_stack(0);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * wait_next_period - Wait for the next period for timerlat
+ */
+static int wait_next_period(struct timerlat_variables *tlat)
+{
+ ktime_t next_abs_period, now;
+ u64 rel_period = osnoise_data.timerlat_period * 1000;
+
+ now = hrtimer_cb_get_time(&tlat->timer);
+ next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+
+ /*
+ * Save the next abs_period.
+ */
+ tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+
+ /*
+ * If the new abs_period is in the past, skip the activation.
+ */
+ while (ktime_compare(now, next_abs_period) > 0) {
+ next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+ tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD);
+ schedule();
+ return 1;
+}
+
+/*
+ * timerlat_main- Timerlat main
+ */
+static int timerlat_main(void *data)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ struct timerlat_variables *tlat = this_cpu_tmr_var();
+ struct timerlat_sample s;
+ struct sched_param sp;
+ unsigned long flags;
+ u64 now, diff;
+
+ /*
+ * Make the thread RT, that is how cyclictest is usually used.
+ */
+ sp.sched_priority = DEFAULT_TIMERLAT_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+ /*
+ * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
+ * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
+ *
+ * To work around this limitation, disable migration and remove the
+ * flag.
+ */
+ migrate_disable();
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ current->flags &= ~(PF_NO_SETAFFINITY);
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ tlat->count = 0;
+ tlat->tracing_thread = false;
+
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+ tlat->kthread = current;
+ osn_var->pid = current->pid;
+ /*
+ * Anotate the arrival time.
+ */
+ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
+
+ wait_next_period(tlat);
+
+ osn_var->sampling = 1;
+
+ while (!kthread_should_stop()) {
+
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+ notify_new_max_latency(diff);
+
+ timerlat_dump_stack(time_to_us(diff));
+
+ tlat->tracing_thread = false;
+ if (osnoise_data.stop_tracing_total)
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+
+ if (osnoise_migration_pending())
+ break;
+
+ wait_next_period(tlat);
+ }
+
+ hrtimer_cancel(&tlat->timer);
+ migrate_enable();
+ return 0;
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int timerlat_main(void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * stop_kthread - stop a workload thread
+ */
+static void stop_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+
+ kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+ if (kthread) {
+ if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ kthread_stop(kthread);
+ } else {
+ /*
+ * This is a user thread waiting on the timerlat_fd. We need
+ * to close all users, and the best way to guarantee this is
+ * by killing the thread. NOTE: this is a purpose specific file.
+ */
+ kill_pid(kthread->thread_pid, SIGKILL, 1);
+ put_task_struct(kthread);
+ }
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ } else {
+ /* if no workload, just return */
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ /*
+ * This is set in the osnoise tracer case.
+ */
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
+ barrier();
+ return;
+ }
+ }
+}
+
+/*
+ * stop_per_cpu_kthread - Stop per-cpu threads
+ *
+ * Stop the osnoise sampling htread. Use this on unload and at system
+ * shutdown.
+ */
+static void stop_per_cpu_kthreads(void)
+{
+ int cpu;
+
+ cpus_read_lock();
+
+ for_each_online_cpu(cpu)
+ stop_kthread(cpu);
+
+ cpus_read_unlock();
+}
+
+/*
+ * start_kthread - Start a workload tread
+ */
+static int start_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+ void *main = osnoise_main;
+ char comm[24];
+
+ if (timerlat_enabled()) {
+ snprintf(comm, 24, "timerlat/%d", cpu);
+ main = timerlat_main;
+ } else {
+ /* if no workload, just return */
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = true;
+ barrier();
+ return 0;
+ }
+ snprintf(comm, 24, "osnoise/%d", cpu);
+ }
+
+ kthread = kthread_run_on_cpu(main, NULL, cpu, comm);
+
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ stop_per_cpu_kthreads();
+ return -ENOMEM;
+ }
+
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+
+ return 0;
+}
+
+/*
+ * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads
+ *
+ * This starts the kernel thread that will look for osnoise on many
+ * cpus.
+ */
+static int start_per_cpu_kthreads(void)
+{
+ struct cpumask *current_mask = &save_cpumask;
+ int retval = 0;
+ int cpu;
+
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (timerlat_enabled())
+ return 0;
+ }
+
+ cpus_read_lock();
+ /*
+ * Run only on online CPUs in which osnoise is allowed to run.
+ */
+ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+
+ for_each_cpu(cpu, current_mask) {
+ retval = start_kthread(cpu);
+ if (retval) {
+ cpus_read_unlock();
+ stop_per_cpu_kthreads();
+ return retval;
+ }
+ }
+
+ cpus_read_unlock();
+
+ return retval;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void osnoise_hotplug_workfn(struct work_struct *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ mutex_lock(&trace_types_lock);
+
+ if (!osnoise_has_registered_instances())
+ goto out_unlock_trace;
+
+ mutex_lock(&interface_lock);
+ cpus_read_lock();
+
+ if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
+ goto out_unlock;
+
+ start_kthread(cpu);
+
+out_unlock:
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+out_unlock_trace:
+ mutex_unlock(&trace_types_lock);
+}
+
+static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
+
+/*
+ * osnoise_cpu_init - CPU hotplug online callback function
+ */
+static int osnoise_cpu_init(unsigned int cpu)
+{
+ schedule_work_on(cpu, &osnoise_hotplug_work);
+ return 0;
+}
+
+/*
+ * osnoise_cpu_die - CPU hotplug offline callback function
+ */
+static int osnoise_cpu_die(unsigned int cpu)
+{
+ stop_kthread(cpu);
+ return 0;
+}
+
+static void osnoise_init_hotplug_support(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online",
+ osnoise_cpu_init, osnoise_cpu_die);
+ if (ret < 0)
+ pr_warn(BANNER "Error to init cpu hotplug support\n");
+
+ return;
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void osnoise_init_hotplug_support(void)
+{
+ return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * seq file functions for the osnoise/options file.
+ */
+static void *s_options_start(struct seq_file *s, loff_t *pos)
+{
+ int option = *pos;
+
+ mutex_lock(&interface_lock);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static void *s_options_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ int option = ++(*pos);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static int s_options_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ int option = *pos;
+
+ if (option == OSN_DEFAULTS) {
+ if (osnoise_options == OSN_DEFAULT_OPTIONS)
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+ goto out;
+ }
+
+ if (test_bit(option, &osnoise_options))
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+
+out:
+ if (option != OSN_MAX)
+ seq_puts(s, " ");
+
+ return 0;
+}
+
+static void s_options_stop(struct seq_file *s, void *v)
+{
+ seq_puts(s, "\n");
+ mutex_unlock(&interface_lock);
+}
+
+static const struct seq_operations osnoise_options_seq_ops = {
+ .start = s_options_start,
+ .next = s_options_next,
+ .show = s_options_show,
+ .stop = s_options_stop
+};
+
+static int osnoise_options_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &osnoise_options_seq_ops);
+};
+
+/**
+ * osnoise_options_write - Write function for "options" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * Writing the option name sets the option, writing the "NO_"
+ * prefix in front of the option name disables it.
+ *
+ * Writing "DEFAULTS" resets the option values to the default ones.
+ */
+static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int running, option, enable, retval;
+ char buf[256], *option_str;
+
+ if (cnt >= 256)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "NO_", 3)) {
+ option_str = strstrip(buf);
+ enable = true;
+ } else {
+ option_str = strstrip(&buf[3]);
+ enable = false;
+ }
+
+ option = match_string(osnoise_options_str, OSN_MAX, option_str);
+ if (option < 0)
+ return -EINVAL;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop.
+ */
+ mutex_lock(&trace_types_lock);
+ running = osnoise_has_registered_instances();
+ if (running)
+ stop_per_cpu_kthreads();
+
+ mutex_lock(&interface_lock);
+ /*
+ * avoid CPU hotplug operations that might read options.
+ */
+ cpus_read_lock();
+
+ retval = cnt;
+
+ if (enable) {
+ if (option == OSN_DEFAULTS)
+ osnoise_options = OSN_DEFAULT_OPTIONS;
+ else
+ set_bit(option, &osnoise_options);
+ } else {
+ if (option == OSN_DEFAULTS)
+ retval = -EINVAL;
+ else
+ clear_bit(option, &osnoise_options);
+ }
+
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+
+ if (running)
+ start_per_cpu_kthreads();
+ mutex_unlock(&trace_types_lock);
+
+ return retval;
+}
+
+/*
+ * osnoise_cpus_read - Read function for reading the "cpus" file
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * Prints the "cpus" output into the user-provided buffer.
+ */
+static ssize_t
+osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ char *mask_str;
+ int len;
+
+ mutex_lock(&interface_lock);
+
+ len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str) {
+ count = -ENOMEM;
+ goto out_unlock;
+ }
+
+ len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
+ if (len >= count) {
+ count = -EINVAL;
+ goto out_free;
+ }
+
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
+
+out_free:
+ kfree(mask_str);
+out_unlock:
+ mutex_unlock(&interface_lock);
+
+ return count;
+}
+
+/*
+ * osnoise_cpus_write - Write function for "cpus" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "cpus"
+ * interface to the osnoise trace. By default, it lists all CPUs,
+ * in this way, allowing osnoise threads to run on any online CPU
+ * of the system. It serves to restrict the execution of osnoise to the
+ * set of CPUs writing via this interface. Why not use "tracing_cpumask"?
+ * Because the user might be interested in tracing what is running on
+ * other CPUs. For instance, one might run osnoise in one HT CPU
+ * while observing what is running on the sibling HT CPU.
+ */
+static ssize_t
+osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ cpumask_var_t osnoise_cpumask_new;
+ int running, err;
+ char buf[256];
+
+ if (count >= 256)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, count))
+ return -EFAULT;
+
+ if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpulist_parse(buf, osnoise_cpumask_new);
+ if (err)
+ goto err_free;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop.
+ */
+ mutex_lock(&trace_types_lock);
+ running = osnoise_has_registered_instances();
+ if (running)
+ stop_per_cpu_kthreads();
+
+ mutex_lock(&interface_lock);
+ /*
+ * osnoise_cpumask is read by CPU hotplug operations.
+ */
+ cpus_read_lock();
+
+ cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
+
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+
+ if (running)
+ start_per_cpu_kthreads();
+ mutex_unlock(&trace_types_lock);
+
+ free_cpumask_var(osnoise_cpumask_new);
+ return count;
+
+err_free:
+ free_cpumask_var(osnoise_cpumask_new);
+
+ return err;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+static int timerlat_fd_open(struct inode *inode, struct file *file)
+{
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat;
+ long cpu = (long) inode->i_cdev;
+
+ mutex_lock(&interface_lock);
+
+ /*
+ * This file is accessible only if timerlat is enabled, and
+ * NO_OSNOISE_WORKLOAD is set.
+ */
+ if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ mutex_unlock(&interface_lock);
+ return -EINVAL;
+ }
+
+ migrate_disable();
+
+ osn_var = this_cpu_osn_var();
+
+ /*
+ * The osn_var->pid holds the single access to this file.
+ */
+ if (osn_var->pid) {
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return -EBUSY;
+ }
+
+ /*
+ * timerlat tracer is a per-cpu tracer. Check if the user-space too
+ * is pinned to a single CPU. The tracer laters monitor if the task
+ * migrates and then disables tracer if it does. However, it is
+ * worth doing this basic acceptance test to avoid obviusly wrong
+ * setup.
+ */
+ if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) {
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return -EPERM;
+ }
+
+ /*
+ * From now on, it is good to go.
+ */
+ file->private_data = inode->i_cdev;
+
+ get_task_struct(current);
+
+ osn_var->kthread = current;
+ osn_var->pid = current->pid;
+
+ /*
+ * Setup is done.
+ */
+ mutex_unlock(&interface_lock);
+
+ tlat = this_cpu_tmr_var();
+ tlat->count = 0;
+
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+
+ migrate_enable();
+ return 0;
+};
+
+/*
+ * timerlat_fd_read - Read function for "timerlat_fd" file
+ * @file: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error.
+ */
+static ssize_t
+timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ long cpu = (long) file->private_data;
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat;
+ struct timerlat_sample s;
+ s64 diff;
+ u64 now;
+
+ migrate_disable();
+
+ tlat = this_cpu_tmr_var();
+
+ /*
+ * While in user-space, the thread is migratable. There is nothing
+ * we can do about it.
+ * So, if the thread is running on another CPU, stop the machinery.
+ */
+ if (cpu == smp_processor_id()) {
+ if (tlat->uthread_migrate) {
+ migrate_enable();
+ return -EINVAL;
+ }
+ } else {
+ per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
+ osnoise_taint("timerlat user thread migrate\n");
+ osnoise_stop_tracing();
+ migrate_enable();
+ return -EINVAL;
+ }
+
+ osn_var = this_cpu_osn_var();
+
+ /*
+ * The timerlat in user-space runs in a different order:
+ * the read() starts from the execution of the previous occurrence,
+ * sleeping for the next occurrence.
+ *
+ * So, skip if we are entering on read() before the first wakeup
+ * from timerlat IRQ:
+ */
+ if (likely(osn_var->sampling)) {
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ /*
+ * it was not a timer firing, but some other signal?
+ */
+ if (diff < 0)
+ goto out;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_URET;
+
+ trace_timerlat_sample(&s);
+
+ notify_new_max_latency(diff);
+
+ tlat->tracing_thread = false;
+ if (osnoise_data.stop_tracing_total)
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+ } else {
+ tlat->tracing_thread = false;
+ tlat->kthread = current;
+
+ /* Annotate now to drift new period */
+ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
+
+ osn_var->sampling = 1;
+ }
+
+ /* wait for the next period */
+ wait_next_period(tlat);
+
+ /* This is the wakeup from this cycle */
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ /*
+ * it was not a timer firing, but some other signal?
+ */
+ if (diff < 0)
+ goto out;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+ if (osnoise_data.stop_tracing_total) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
+ timerlat_dump_stack(time_to_us(diff));
+ notify_new_max_latency(diff);
+ osnoise_stop_tracing();
+ }
+ }
+
+out:
+ migrate_enable();
+ return 0;
+}
+
+static int timerlat_fd_release(struct inode *inode, struct file *file)
+{
+ struct osnoise_variables *osn_var;
+ struct timerlat_variables *tlat_var;
+ long cpu = (long) file->private_data;
+
+ migrate_disable();
+ mutex_lock(&interface_lock);
+
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+
+ hrtimer_cancel(&tlat_var->timer);
+ memset(tlat_var, 0, sizeof(*tlat_var));
+
+ osn_var->sampling = 0;
+ osn_var->pid = 0;
+
+ /*
+ * We are leaving, not being stopped... see stop_kthread();
+ */
+ if (osn_var->kthread) {
+ put_task_struct(osn_var->kthread);
+ osn_var->kthread = NULL;
+ }
+
+ mutex_unlock(&interface_lock);
+ migrate_enable();
+ return 0;
+}
+#endif
+
+/*
+ * osnoise/runtime_us: cannot be greater than the period.
+ */
+static struct trace_min_max_param osnoise_runtime = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.sample_runtime,
+ .max = &osnoise_data.sample_period,
+ .min = NULL,
+};
+
+/*
+ * osnoise/period_us: cannot be smaller than the runtime.
+ */
+static struct trace_min_max_param osnoise_period = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.sample_period,
+ .max = NULL,
+ .min = &osnoise_data.sample_runtime,
+};
+
+/*
+ * osnoise/stop_tracing_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_in = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.stop_tracing,
+ .max = NULL,
+ .min = NULL,
+};
+
+/*
+ * osnoise/stop_tracing_total_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_total = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.stop_tracing_total,
+ .max = NULL,
+ .min = NULL,
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * osnoise/print_stack: print the stacktrace of the IRQ handler if the total
+ * latency is higher than val.
+ */
+static struct trace_min_max_param osnoise_print_stack = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.print_stack,
+ .max = NULL,
+ .min = NULL,
+};
+
+/*
+ * osnoise/timerlat_period: min 100 us, max 1 s
+ */
+static u64 timerlat_min_period = 100;
+static u64 timerlat_max_period = 1000000;
+static struct trace_min_max_param timerlat_period = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.timerlat_period,
+ .max = &timerlat_max_period,
+ .min = &timerlat_min_period,
+};
+
+static const struct file_operations timerlat_fd_fops = {
+ .open = timerlat_fd_open,
+ .read = timerlat_fd_read,
+ .release = timerlat_fd_release,
+ .llseek = generic_file_llseek,
+};
+#endif
+
+static const struct file_operations cpus_fops = {
+ .open = tracing_open_generic,
+ .read = osnoise_cpus_read,
+ .write = osnoise_cpus_write,
+ .llseek = generic_file_llseek,
+};
+
+static const struct file_operations osnoise_options_fops = {
+ .open = osnoise_options_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = osnoise_options_write
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+#ifdef CONFIG_STACKTRACE
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+
+ tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir,
+ &osnoise_print_stack, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ return 0;
+}
+#else /* CONFIG_STACKTRACE */
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_STACKTRACE */
+
+static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
+{
+ struct dentry *timerlat_fd;
+ struct dentry *per_cpu;
+ struct dentry *cpu_dir;
+ char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */
+ long cpu;
+
+ /*
+ * Why not using tracing instance per_cpu/ dir?
+ *
+ * Because osnoise/timerlat have a single workload, having
+ * multiple files like these are wast of memory.
+ */
+ per_cpu = tracefs_create_dir("per_cpu", top_dir);
+ if (!per_cpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ snprintf(cpu_str, 30, "cpu%ld", cpu);
+ cpu_dir = tracefs_create_dir(cpu_str, per_cpu);
+ if (!cpu_dir)
+ goto out_clean;
+
+ timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ,
+ cpu_dir, NULL, &timerlat_fd_fops);
+ if (!timerlat_fd)
+ goto out_clean;
+
+ /* Record the CPU */
+ d_inode(timerlat_fd)->i_cdev = (void *)(cpu);
+ }
+
+ return 0;
+
+out_clean:
+ tracefs_remove(per_cpu);
+ return -ENOMEM;
+}
+
+/*
+ * init_timerlat_tracefs - A function to initialize the timerlat interface files
+ */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+ int retval;
+
+ tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_period, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ retval = osnoise_create_cpu_timerlat_fd(top_dir);
+ if (retval)
+ return retval;
+
+ return init_timerlat_stack_tracefs(top_dir);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "osnoise" and "timerlat".
+ * It creates these directories in the tracing directory, and within that
+ * directory the use can change and view the configs.
+ */
+static int init_tracefs(void)
+{
+ struct dentry *top_dir;
+ struct dentry *tmp;
+ int ret;
+
+ ret = tracing_init_dentry();
+ if (ret)
+ return -ENOMEM;
+
+ top_dir = tracefs_create_dir("osnoise", NULL);
+ if (!top_dir)
+ return 0;
+
+ tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir,
+ &osnoise_period, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir,
+ &osnoise_runtime, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir,
+ &osnoise_stop_tracing_in, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir,
+ &osnoise_stop_tracing_total, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL,
+ &osnoise_options_fops);
+ if (!tmp)
+ goto err;
+
+ ret = init_timerlat_tracefs(top_dir);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ tracefs_remove(top_dir);
+ return -ENOMEM;
+}
+
+static int osnoise_hook_events(void)
+{
+ int retval;
+
+ /*
+ * Trace is already hooked, we are re-enabling from
+ * a stop_tracing_*.
+ */
+ if (trace_osnoise_callback_enabled)
+ return 0;
+
+ retval = hook_irq_events();
+ if (retval)
+ return -EINVAL;
+
+ retval = hook_softirq_events();
+ if (retval)
+ goto out_unhook_irq;
+
+ retval = hook_thread_events();
+ /*
+ * All fine!
+ */
+ if (!retval)
+ return 0;
+
+ unhook_softirq_events();
+out_unhook_irq:
+ unhook_irq_events();
+ return -EINVAL;
+}
+
+static void osnoise_unhook_events(void)
+{
+ unhook_thread_events();
+ unhook_softirq_events();
+ unhook_irq_events();
+}
+
+/*
+ * osnoise_workload_start - start the workload and hook to events
+ */
+static int osnoise_workload_start(void)
+{
+ int retval;
+
+ /*
+ * Instances need to be registered after calling workload
+ * start. Hence, if there is already an instance, the
+ * workload was already registered. Otherwise, this
+ * code is on the way to register the first instance,
+ * and the workload will start.
+ */
+ if (osnoise_has_registered_instances())
+ return 0;
+
+ osn_var_reset_all();
+
+ retval = osnoise_hook_events();
+ if (retval)
+ return retval;
+
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see reset values
+ * before enabling trace_osnoise_callback_enabled.
+ */
+ barrier();
+ trace_osnoise_callback_enabled = true;
+
+ retval = start_per_cpu_kthreads();
+ if (retval) {
+ trace_osnoise_callback_enabled = false;
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see
+ * trace_osnoise_callback_enabled as false before continuing.
+ */
+ barrier();
+
+ osnoise_unhook_events();
+ return retval;
+ }
+
+ return 0;
+}
+
+/*
+ * osnoise_workload_stop - stop the workload and unhook the events
+ */
+static void osnoise_workload_stop(void)
+{
+ /*
+ * Instances need to be unregistered before calling
+ * stop. Hence, if there is a registered instance, more
+ * than one instance is running, and the workload will not
+ * yet stop. Otherwise, this code is on the way to disable
+ * the last instance, and the workload can stop.
+ */
+ if (osnoise_has_registered_instances())
+ return;
+
+ /*
+ * If callbacks were already disabled in a previous stop
+ * call, there is no need to disable then again.
+ *
+ * For instance, this happens when tracing is stopped via:
+ * echo 0 > tracing_on
+ * echo nop > current_tracer.
+ */
+ if (!trace_osnoise_callback_enabled)
+ return;
+
+ trace_osnoise_callback_enabled = false;
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see
+ * trace_osnoise_callback_enabled as false before continuing.
+ */
+ barrier();
+
+ stop_per_cpu_kthreads();
+
+ osnoise_unhook_events();
+}
+
+static void osnoise_tracer_start(struct trace_array *tr)
+{
+ int retval;
+
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
+ return;
+
+ retval = osnoise_workload_start();
+ if (retval)
+ pr_err(BANNER "Error starting osnoise tracer\n");
+
+ osnoise_register_instance(tr);
+}
+
+static void osnoise_tracer_stop(struct trace_array *tr)
+{
+ osnoise_unregister_instance(tr);
+ osnoise_workload_stop();
+}
+
+static int osnoise_tracer_init(struct trace_array *tr)
+{
+ /*
+ * Only allow osnoise tracer if timerlat tracer is not running
+ * already.
+ */
+ if (timerlat_enabled())
+ return -EBUSY;
+
+ tr->max_latency = 0;
+
+ osnoise_tracer_start(tr);
+ return 0;
+}
+
+static void osnoise_tracer_reset(struct trace_array *tr)
+{
+ osnoise_tracer_stop(tr);
+}
+
+static struct tracer osnoise_tracer __read_mostly = {
+ .name = "osnoise",
+ .init = osnoise_tracer_init,
+ .reset = osnoise_tracer_reset,
+ .start = osnoise_tracer_start,
+ .stop = osnoise_tracer_stop,
+ .print_header = print_osnoise_headers,
+ .allow_instances = true,
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+static void timerlat_tracer_start(struct trace_array *tr)
+{
+ int retval;
+
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
+ return;
+
+ retval = osnoise_workload_start();
+ if (retval)
+ pr_err(BANNER "Error starting timerlat tracer\n");
+
+ osnoise_register_instance(tr);
+
+ return;
+}
+
+static void timerlat_tracer_stop(struct trace_array *tr)
+{
+ int cpu;
+
+ osnoise_unregister_instance(tr);
+
+ /*
+ * Instruct the threads to stop only if this is the last instance.
+ */
+ if (!osnoise_has_registered_instances()) {
+ for_each_online_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+ }
+
+ osnoise_workload_stop();
+}
+
+static int timerlat_tracer_init(struct trace_array *tr)
+{
+ /*
+ * Only allow timerlat tracer if osnoise tracer is not running already.
+ */
+ if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer)
+ return -EBUSY;
+
+ /*
+ * If this is the first instance, set timerlat_tracer to block
+ * osnoise tracer start.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 1;
+
+ tr->max_latency = 0;
+ timerlat_tracer_start(tr);
+
+ return 0;
+}
+
+static void timerlat_tracer_reset(struct trace_array *tr)
+{
+ timerlat_tracer_stop(tr);
+
+ /*
+ * If this is the last instance, reset timerlat_tracer allowing
+ * osnoise to be started.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 0;
+}
+
+static struct tracer timerlat_tracer __read_mostly = {
+ .name = "timerlat",
+ .init = timerlat_tracer_init,
+ .reset = timerlat_tracer_reset,
+ .start = timerlat_tracer_start,
+ .stop = timerlat_tracer_stop,
+ .print_header = print_timerlat_headers,
+ .allow_instances = true,
+};
+
+__init static int init_timerlat_tracer(void)
+{
+ return register_tracer(&timerlat_tracer);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+__init static int init_timerlat_tracer(void)
+{
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+__init static int init_osnoise_tracer(void)
+{
+ int ret;
+
+ mutex_init(&interface_lock);
+
+ cpumask_copy(&osnoise_cpumask, cpu_all_mask);
+
+ ret = register_tracer(&osnoise_tracer);
+ if (ret) {
+ pr_err(BANNER "Error registering osnoise!\n");
+ return ret;
+ }
+
+ ret = init_timerlat_tracer();
+ if (ret) {
+ pr_err(BANNER "Error registering timerlat!\n");
+ return ret;
+ }
+
+ osnoise_init_hotplug_support();
+
+ INIT_LIST_HEAD_RCU(&osnoise_instances);
+
+ init_tracefs();
+
+ return 0;
+}
+late_initcall(init_osnoise_tracer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 73976de7f8cc..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -8,8 +8,10 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/idr.h>
#include "trace_output.h"
@@ -20,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
-static int next_event_type = __TRACE_LAST_TYPE + 1;
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -221,8 +221,11 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
const char *ret = trace_seq_buffer_ptr(p);
const char *fmt = concatenate ? "%*phN" : "%*ph";
- for (i = 0; i < buf_len; i += 16)
+ for (i = 0; i < buf_len; i += 16) {
+ if (!concatenate && i != 0)
+ trace_seq_putc(p, ' ');
trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);
+ }
trace_seq_putc(p, 0);
return ret;
@@ -312,13 +315,24 @@ int trace_raw_output_prep(struct trace_iterator *iter,
}
EXPORT_SYMBOL(trace_raw_output_prep);
-static int trace_output_raw(struct trace_iterator *iter, char *name,
- char *fmt, va_list ap)
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
+ va_end(ap);
+}
+EXPORT_SYMBOL(trace_event_printf);
+
+static __printf(3, 0)
+int trace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
trace_seq_printf(s, "%s: ", name);
- trace_seq_vprintf(s, fmt, ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
return trace_handle_return(s);
}
@@ -336,25 +350,15 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
}
EXPORT_SYMBOL_GPL(trace_output_call);
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
+static inline const char *kretprobed(const char *name, unsigned long addr)
{
- static const char tramp_name[] = "kretprobe_trampoline";
- int size = sizeof(tramp_name);
-
- if (strncmp(tramp_name, name, size) == 0)
+ if (is_kretprobe_trampoline(addr))
return "[unknown/kretprobe'd]";
return name;
}
-#else
-static inline const char *kretprobed(const char *name)
-{
- return name;
-}
-#endif /* CONFIG_KRETPROBES */
-static void
-seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
+void
+trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
@@ -364,7 +368,7 @@ seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
sprint_symbol(str, address);
else
kallsyms_lookup(address, NULL, NULL, NULL, str);
- name = kretprobed(str);
+ name = kretprobed(str, address);
if (name && strlen(name)) {
trace_seq_puts(s, name);
@@ -400,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
vmstart = vma->vm_start;
}
if (file) {
- ret = trace_seq_path(s, &file->f_path);
+ ret = trace_seq_path(s, file_user_path(file));
if (ret)
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
@@ -420,7 +424,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
goto out;
}
- seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+ trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
if (sym_flags & TRACE_ITER_SYM_ADDR)
trace_seq_printf(s, " <" IP_FMT ">", ip);
@@ -444,14 +448,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
+ int bh_off;
int nmi;
nmi = entry->flags & TRACE_FLAG_NMI;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ bh_off = entry->flags & TRACE_FLAG_BH_OFF;
irqs_off =
+ (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ bh_off ? 'b' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
@@ -482,8 +490,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
trace_seq_printf(s, "%c%c%c",
irqs_off, need_resched, hardsoft_irq);
- if (entry->preempt_count)
- trace_seq_printf(s, "%x", entry->preempt_count);
+ if (entry->preempt_count & 0xf)
+ trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
+ else
+ trace_seq_putc(s, '.');
+
+ if (entry->preempt_count & 0xf0)
+ trace_seq_printf(s, "%x", entry->preempt_count >> 4);
else
trace_seq_putc(s, '.');
@@ -497,7 +510,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%8.8s-%-5d %3d",
+ trace_seq_printf(s, "%8.8s-%-7d %3d",
comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
@@ -577,26 +590,39 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
return !trace_seq_has_overflowed(s);
}
+static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter,
+ unsigned long long ts)
+{
+ unsigned long secs, usec_rem;
+ unsigned long long t;
+
+ if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
+ t = ns2usecs(ts);
+ usec_rem = do_div(t, USEC_PER_SEC);
+ secs = (unsigned long)t;
+ trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem);
+ } else
+ trace_seq_printf(s, " %12llu", ts);
+}
+
int trace_print_context(struct trace_iterator *iter)
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
- unsigned long long t;
- unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+ trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid);
if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
unsigned int tgid = trace_find_tgid(entry->pid);
if (!tgid)
- trace_seq_printf(s, "(-----) ");
+ trace_seq_printf(s, "(-------) ");
else
- trace_seq_printf(s, "(%5d) ", tgid);
+ trace_seq_printf(s, "(%7d) ", tgid);
}
trace_seq_printf(s, "[%03d] ", iter->cpu);
@@ -604,13 +630,8 @@ int trace_print_context(struct trace_iterator *iter)
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
- if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
- t = ns2usecs(iter->ts);
- usec_rem = do_div(t, USEC_PER_SEC);
- secs = (unsigned long)t;
- trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
- } else
- trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_print_time(s, iter, iter->ts);
+ trace_seq_puts(s, ": ");
return !trace_seq_has_overflowed(s);
}
@@ -636,9 +657,9 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
+ s, "%16s %7d %3d %d %08x %08lx ",
comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ entry->preempt_count & 0xf, iter->idx);
} else {
lat_print_generic(s, entry, iter->cpu);
}
@@ -670,34 +691,24 @@ struct trace_event *ftrace_find_event(int type)
return NULL;
}
-static LIST_HEAD(ftrace_event_list);
+static DEFINE_IDA(trace_event_ida);
-static int trace_search_list(struct list_head **list)
+static void free_trace_event_type(int type)
{
- struct trace_event *e;
- int last = __TRACE_LAST_TYPE;
-
- if (list_empty(&ftrace_event_list)) {
- *list = &ftrace_event_list;
- return last + 1;
- }
+ if (type >= __TRACE_LAST_TYPE)
+ ida_free(&trace_event_ida, type);
+}
- /*
- * We used up all possible max events,
- * lets see if somebody freed one.
- */
- list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != last + 1)
- break;
- last++;
- }
+static int alloc_trace_event_type(void)
+{
+ int next;
- /* Did we used up all 65 thousand events??? */
- if ((last + 1) > TRACE_EVENT_TYPE_MAX)
+ /* Skip static defined type numbers */
+ next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
+ TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
+ if (next < 0)
return 0;
-
- *list = &e->list;
- return last + 1;
+ return next;
}
void trace_event_read_lock(void)
@@ -738,31 +749,12 @@ int register_trace_event(struct trace_event *event)
if (WARN_ON(!event->funcs))
goto out;
- INIT_LIST_HEAD(&event->list);
-
if (!event->type) {
- struct list_head *list = NULL;
-
- if (next_event_type > TRACE_EVENT_TYPE_MAX) {
-
- event->type = trace_search_list(&list);
- if (!event->type)
- goto out;
-
- } else {
-
- event->type = next_event_type++;
- list = &ftrace_event_list;
- }
-
- if (WARN_ON(ftrace_find_event(event->type)))
+ event->type = alloc_trace_event_type();
+ if (!event->type)
goto out;
-
- list_add_tail(&event->list, list);
-
- } else if (event->type > __TRACE_LAST_TYPE) {
- printk(KERN_WARNING "Need to add type to trace.h\n");
- WARN_ON(1);
+ } else if (WARN(event->type > __TRACE_LAST_TYPE,
+ "Need to add type to trace.h")) {
goto out;
} else {
/* Is this event already used */
@@ -797,7 +789,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
int __unregister_trace_event(struct trace_event *event)
{
hlist_del(&event->node);
- list_del(&event->list);
+ free_trace_event_type(event->type);
return 0;
}
@@ -819,6 +811,176 @@ EXPORT_SYMBOL_GPL(unregister_trace_event);
* Standard events
*/
+static void print_array(struct trace_iterator *iter, void *pos,
+ struct ftrace_event_field *field)
+{
+ int offset;
+ int len;
+ int i;
+
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ return;
+ }
+
+ pos = (void *)iter->ent + offset;
+
+ for (i = 0; i < len; i++, pos++) {
+ if (i)
+ trace_seq_putc(&iter->seq, ',');
+ trace_seq_printf(&iter->seq, "%02x", *(unsigned char *)pos);
+ }
+}
+
+static void print_fields(struct trace_iterator *iter, struct trace_event_call *call,
+ struct list_head *head)
+{
+ struct ftrace_event_field *field;
+ int offset;
+ int len;
+ int ret;
+ void *pos;
+
+ list_for_each_entry_reverse(field, head, link) {
+ trace_seq_printf(&iter->seq, " %s=", field->name);
+ if (field->offset + field->size > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ continue;
+ }
+ pos = (void *)iter->ent + field->offset;
+
+ switch (field->filter_type) {
+ case FILTER_COMM:
+ case FILTER_STATIC_STRING:
+ trace_seq_printf(&iter->seq, "%.*s", field->size, (char *)pos);
+ break;
+ case FILTER_RDYN_STRING:
+ case FILTER_DYN_STRING:
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field->filter_type == FILTER_RDYN_STRING)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ break;
+ }
+ pos = (void *)iter->ent + offset;
+ trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos);
+ break;
+ case FILTER_PTR_STRING:
+ if (!iter->fmt_size)
+ trace_iter_expand_format(iter);
+ pos = *(void **)pos;
+ ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ iter->fmt_size);
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", pos);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ pos, iter->fmt);
+ break;
+ case FILTER_TRACE_FN:
+ pos = *(void **)pos;
+ trace_seq_printf(&iter->seq, "%pS", pos);
+ break;
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ switch (field->size) {
+ case 1:
+ if (isprint(*(char *)pos)) {
+ trace_seq_printf(&iter->seq, "'%c'",
+ *(unsigned char *)pos);
+ }
+ trace_seq_printf(&iter->seq, "(%d)",
+ *(unsigned char *)pos);
+ break;
+ case 2:
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned short *)pos,
+ *(unsigned short *)pos);
+ break;
+ case 4:
+ /* dynamic array info is 4 bytes */
+ if (strstr(field->type, "__data_loc")) {
+ print_array(iter, pos, NULL);
+ break;
+ }
+
+ if (strstr(field->type, "__rel_loc")) {
+ print_array(iter, pos, field);
+ break;
+ }
+
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned int *)pos,
+ *(unsigned int *)pos);
+ break;
+ case 8:
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)",
+ *(unsigned long long *)pos,
+ *(unsigned long long *)pos);
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
+ break;
+ }
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-TYPE>");
+ }
+ }
+ trace_seq_putc(&iter->seq, '\n');
+}
+
+enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event)
+{
+ struct trace_event_call *call;
+ struct list_head *head;
+
+ /* ftrace defined events have separate call structures */
+ if (event->type <= __TRACE_LAST_TYPE) {
+ bool found = false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->event.type == event->type) {
+ found = true;
+ break;
+ }
+ /* No need to search all events */
+ if (call->event.type > __TRACE_LAST_TYPE)
+ break;
+ }
+ up_read(&trace_event_sem);
+ if (!found) {
+ trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
+ goto out;
+ }
+ } else {
+ call = container_of(event, struct trace_event_call, event);
+ }
+ head = trace_get_fields(call);
+
+ trace_seq_printf(&iter->seq, "%s:", trace_event_name(call));
+
+ if (head && !list_empty(head))
+ print_fields(iter, call, head);
+ else
+ trace_seq_puts(&iter->seq, "No fields found\n");
+
+ out:
+ return trace_handle_return(&iter->seq);
+}
+
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -827,6 +989,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
return trace_handle_return(&iter->seq);
}
+static void print_fn_trace(struct trace_seq *s, unsigned long ip,
+ unsigned long parent_ip, int flags)
+{
+ seq_print_ip_sym(s, ip, flags);
+
+ if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, parent_ip, flags);
+ }
+}
+
/* TRACE_FN */
static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -836,13 +1009,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
-
- if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- trace_seq_puts(s, " <-");
- seq_print_ip_sym(s, field->parent_ip, flags);
- }
-
+ print_fn_trace(s, field->ip, field->parent_ip, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -917,7 +1084,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n",
field->prev_pid,
field->prev_prio,
S, delim,
@@ -1179,7 +1346,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-
static enum print_line_t
trace_hwlat_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -1209,6 +1375,124 @@ static struct trace_event trace_hwlat_event = {
.funcs = &trace_hwlat_funcs,
};
+/* TRACE_OSNOISE */
+static enum print_line_t
+trace_osnoise_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct osnoise_entry *field;
+ u64 ratio, ratio_dec;
+ u64 net_runtime;
+
+ trace_assign_type(field, entry);
+
+ /*
+ * compute the available % of cpu time.
+ */
+ net_runtime = field->runtime - field->noise;
+ ratio = net_runtime * 10000000;
+ do_div(ratio, field->runtime);
+ ratio_dec = do_div(ratio, 100000);
+
+ trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu",
+ field->runtime,
+ field->noise,
+ ratio, ratio_dec,
+ field->max_sample);
+
+ trace_seq_printf(s, " %6u", field->hw_count);
+ trace_seq_printf(s, " %6u", field->nmi_count);
+ trace_seq_printf(s, " %6u", field->irq_count);
+ trace_seq_printf(s, " %6u", field->softirq_count);
+ trace_seq_printf(s, " %6u", field->thread_count);
+
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_osnoise_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct osnoise_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n",
+ field->runtime,
+ field->noise,
+ field->max_sample,
+ field->hw_count,
+ field->nmi_count,
+ field->irq_count,
+ field->softirq_count,
+ field->thread_count);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_osnoise_funcs = {
+ .trace = trace_osnoise_print,
+ .raw = trace_osnoise_raw,
+};
+
+static struct trace_event trace_osnoise_event = {
+ .type = TRACE_OSNOISE,
+ .funcs = &trace_osnoise_funcs,
+};
+
+/* TRACE_TIMERLAT */
+
+static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"};
+static enum print_line_t
+trace_timerlat_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct timerlat_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n",
+ field->seqnum,
+ timerlat_lat_context[field->context],
+ field->timer_latency);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_timerlat_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct timerlat_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%u %d %llu\n",
+ field->seqnum,
+ field->context,
+ field->timer_latency);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_timerlat_funcs = {
+ .trace = trace_timerlat_print,
+ .raw = trace_timerlat_raw,
+};
+
+static struct trace_event trace_timerlat_event = {
+ .type = TRACE_TIMERLAT,
+ .funcs = &trace_timerlat_funcs,
+};
+
/* TRACE_BPUTS */
static enum print_line_t
trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1363,6 +1647,51 @@ static struct trace_event trace_raw_data_event = {
.funcs = &trace_raw_data_funcs,
};
+static enum print_line_t
+trace_func_repeats_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct func_repeats_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%lu %lu %u %llu\n",
+ field->ip,
+ field->parent_ip,
+ field->count,
+ FUNC_REPEATS_GET_DELTA_TS(field));
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_func_repeats_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct func_repeats_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ print_fn_trace(s, field->ip, field->parent_ip, flags);
+ trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
+ trace_print_time(s, iter,
+ iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
+ trace_seq_puts(s, ")\n");
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_func_repeats_funcs = {
+ .trace = trace_func_repeats_print,
+ .raw = trace_func_repeats_raw,
+};
+
+static struct trace_event trace_func_repeats_event = {
+ .type = TRACE_FUNC_REPEATS,
+ .funcs = &trace_func_repeats_funcs,
+};
static struct trace_event *events[] __initdata = {
&trace_fn_event,
@@ -1374,26 +1703,23 @@ static struct trace_event *events[] __initdata = {
&trace_bprint_event,
&trace_print_event,
&trace_hwlat_event,
+ &trace_osnoise_event,
+ &trace_timerlat_event,
&trace_raw_data_event,
+ &trace_func_repeats_event,
NULL
};
-__init static int init_events(void)
+__init int init_events(void)
{
struct trace_event *event;
int i, ret;
for (i = 0; events[i]; i++) {
event = events[i];
-
ret = register_trace_event(event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- event->type);
- WARN_ON_ONCE(1);
- }
+ WARN_ONCE(!ret, "event %d failed to register", event->type);
}
return 0;
}
-early_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 2f742b74e7e6..dca40f1f1da4 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,8 +16,11 @@ extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
+extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
+extern enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event);
extern void trace_event_read_lock(void);
extern void trace_event_read_unlock(void);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f10073e62603..e37446f7916e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,6 +15,20 @@
#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point) trace_##point
+#else
+#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -40,13 +53,12 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
@@ -78,56 +89,24 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
- this_cpu_write(tracing_irq_cpu, 0);
- }
-
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
- if (!this_cpu_read(tracing_irq_cpu)) {
- this_cpu_write(tracing_irq_cpu, 1);
- tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
- }
-
- lockdep_hardirqs_off(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_enable_rcuidle(a0, a1);
+ trace(preempt_enable)(a0, a1);
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_disable_rcuidle(a0, a1);
+ trace(preempt_disable)(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index d4e31e969206..29f6e95439b6 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -96,7 +96,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
if (val == MODULE_STATE_COMING)
hold_module_trace_bprintk_format(start, end);
}
- return 0;
+ return NOTIFY_OK;
}
/*
@@ -174,7 +174,7 @@ __init static int
module_trace_bprintk_format_notify(struct notifier_block *self,
unsigned long val, void *data)
{
- return 0;
+ return NOTIFY_OK;
}
static inline const char **
find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
@@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
}
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+bool trace_is_tracepoint_string(const char *str)
+{
+ const char **ptr = __start___tracepoint_str;
+
+ for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) {
+ if (str == *ptr)
+ return true;
+ }
+ return false;
+}
+
static const char **find_next(void *v, loff_t *pos)
{
const char **fmt = v;
@@ -367,13 +378,13 @@ static const struct file_operations ftrace_formats_fops = {
static __init int init_trace_printk_function_export(void)
{
- struct dentry *d_tracer;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- trace_create_file("printk_formats", 0444, d_tracer,
+ trace_create_file("printk_formats", TRACE_MODE_READ, NULL,
NULL, &ftrace_formats_fops);
return 0;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d2867ccc6aca..34289f9c6707 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -11,6 +11,9 @@
*/
#define pr_fmt(fmt) "trace_probe: " fmt
+#include <linux/bpf.h>
+#include "trace_btf.h"
+
#include "trace_probe.h"
#undef C
@@ -50,6 +53,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")
int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
{
@@ -64,7 +68,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent)
int len = *(u32 *)data >> 16;
if (!len)
- trace_seq_puts(s, "(fault)");
+ trace_seq_puts(s, FAULT_STRING);
else
trace_seq_printf(s, "\"%s\"",
(const char *)get_loc_data(data, ent));
@@ -76,9 +80,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Fetch type information table */
static const struct fetch_type probe_fetch_types[] = {
/* Special types */
- __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1,
+ "__data_loc char[]"),
+ __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
- __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
/* Basic types */
ASSIGN_FETCH_TYPE(u8, u8, 0),
@@ -93,15 +99,21 @@ static const struct fetch_type probe_fetch_types[] = {
ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0),
ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
ASSIGN_FETCH_TYPE_END
};
-static const struct fetch_type *find_fetch_type(const char *type)
+static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags)
{
int i;
+ /* Reject the symbol/symstr for uprobes */
+ if (type && (flags & TPARG_FL_USER) &&
+ (!strcmp(type, "symbol") || !strcmp(type, "symstr")))
+ return NULL;
+
if (!type)
type = DEFAULT_FETCH_TYPE_STR;
@@ -119,13 +131,13 @@ static const struct fetch_type *find_fetch_type(const char *type)
switch (bs) {
case 8:
- return find_fetch_type("u8");
+ return find_fetch_type("u8", flags);
case 16:
- return find_fetch_type("u16");
+ return find_fetch_type("u16", flags);
case 32:
- return find_fetch_type("u32");
+ return find_fetch_type("u32", flags);
case 64:
- return find_fetch_type("u64");
+ return find_fetch_type("u64", flags);
default:
goto fail;
}
@@ -168,7 +180,7 @@ void __trace_probe_log_err(int offset, int err_type)
if (!trace_probe_log.argv)
return;
- /* Recalcurate the length and allocate buffer */
+ /* Recalculate the length and allocate buffer */
for (i = 0; i < trace_probe_log.argc; i++) {
if (i == trace_probe_log.index)
pos = len;
@@ -182,7 +194,7 @@ void __trace_probe_log_err(int offset, int err_type)
/**
* Set the error position is next to the last arg + space.
* Note that len includes the terminal null and the cursor
- * appaers at pos + 1.
+ * appears at pos + 1.
*/
pos = len;
offset = 0;
@@ -233,6 +245,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
int len;
slash = strchr(event, '/');
+ if (!slash)
+ slash = strchr(event, '.');
+
if (slash) {
if (slash == event) {
trace_probe_log_err(offset, NO_GROUP_NAME);
@@ -242,8 +257,8 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
trace_probe_log_err(offset, GROUP_TOO_LONG);
return -EINVAL;
}
- strlcpy(buf, event, slash - event + 1);
- if (!is_good_name(buf)) {
+ strscpy(buf, event, slash - event + 1);
+ if (!is_good_system_name(buf)) {
trace_probe_log_err(offset, BAD_GROUP_NAME);
return -EINVAL;
}
@@ -254,6 +269,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
len = strlen(event);
if (len == 0) {
+ if (slash) {
+ *pevent = NULL;
+ return 0;
+ }
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
} else if (len > MAX_EVENT_NAME_LEN) {
@@ -267,62 +286,574 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
return 0;
}
+static int parse_trace_event_arg(char *arg, struct fetch_insn *code,
+ struct traceprobe_parse_context *ctx)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ head = trace_get_fields(ctx->event);
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(arg, field->name)) {
+ code->op = FETCH_OP_TP_ARG;
+ code->data = field;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type)
+{
+ const struct btf_type *real_type;
+ u32 intdata;
+ s32 tid;
+
+ real_type = btf_type_skip_modifiers(btf, type->type, &tid);
+ if (!real_type)
+ return false;
+
+ if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT)
+ return false;
+
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
+}
+
+static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
+{
+ const struct btf_type *real_type;
+ const struct btf_array *array;
+ u32 intdata;
+ s32 tid;
+
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY)
+ return false;
+
+ array = (const struct btf_array *)(type + 1);
+
+ real_type = btf_type_skip_modifiers(btf, array->type, &tid);
+
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
+}
+
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = ctx->btf;
+
+ if (!btf || !ctx->last_type)
+ return 0;
+
+ /* char [] does not need any change. */
+ if (btf_type_is_char_array(btf, ctx->last_type))
+ return 0;
+
+ /* char * requires dereference the pointer. */
+ if (btf_type_is_char_ptr(btf, ctx->last_type)) {
+ struct fetch_insn *code = *pcode + 1;
+
+ if (code->op == FETCH_OP_END) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -E2BIG;
+ }
+ if (typename[0] == 'u')
+ code->op = FETCH_OP_UDEREF;
+ else
+ code->op = FETCH_OP_DEREF;
+ code->offset = 0;
+ *pcode = code;
+ return 0;
+ }
+ /* Other types are not available for string */
+ trace_probe_log_err(ctx->offset, BAD_TYPE4STR);
+ return -EINVAL;
+}
+
+static const char *fetch_type_from_btf_type(struct btf *btf,
+ const struct btf_type *type,
+ struct traceprobe_parse_context *ctx)
+{
+ u32 intdata;
+
+ /* TODO: const char * could be converted as a string */
+ switch (BTF_INFO_KIND(type->info)) {
+ case BTF_KIND_ENUM:
+ /* enum is "int", so convert to "s32" */
+ return "s32";
+ case BTF_KIND_ENUM64:
+ return "s64";
+ case BTF_KIND_PTR:
+ /* pointer will be converted to "x??" */
+ if (IS_ENABLED(CONFIG_64BIT))
+ return "x64";
+ else
+ return "x32";
+ case BTF_KIND_INT:
+ intdata = btf_type_int(type);
+ if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) {
+ switch (BTF_INT_BITS(intdata)) {
+ case 8:
+ return "s8";
+ case 16:
+ return "s16";
+ case 32:
+ return "s32";
+ case 64:
+ return "s64";
+ }
+ } else { /* unsigned */
+ switch (BTF_INT_BITS(intdata)) {
+ case 8:
+ return "u8";
+ case 16:
+ return "u16";
+ case 32:
+ return "u32";
+ case 64:
+ return "u64";
+ }
+ /* bitfield, size is encoded in the type */
+ ctx->last_bitsize = BTF_INT_BITS(intdata);
+ ctx->last_bitoffs += BTF_INT_OFFSET(intdata);
+ return "u64";
+ }
+ }
+ /* TODO: support other types */
+
+ return NULL;
+}
+
+static int query_btf_context(struct traceprobe_parse_context *ctx)
+{
+ const struct btf_param *param;
+ const struct btf_type *type;
+ struct btf *btf;
+ s32 nr;
+
+ if (ctx->btf)
+ return 0;
+
+ if (!ctx->funcname)
+ return -EINVAL;
+
+ type = btf_find_func_proto(ctx->funcname, &btf);
+ if (!type)
+ return -ENOENT;
+
+ ctx->btf = btf;
+ ctx->proto = type;
+
+ /* ctx->params is optional, since func(void) will not have params. */
+ nr = 0;
+ param = btf_get_func_param(type, &nr);
+ if (!IS_ERR_OR_NULL(param)) {
+ /* Hide the first 'data' argument of tracepoint */
+ if (ctx->flags & TPARG_FL_TPOINT) {
+ nr--;
+ param++;
+ }
+ }
+
+ if (nr > 0) {
+ ctx->nr_params = nr;
+ ctx->params = param;
+ } else {
+ ctx->nr_params = 0;
+ ctx->params = NULL;
+ }
+
+ return 0;
+}
+
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
+{
+ if (ctx->btf) {
+ btf_put(ctx->btf);
+ ctx->btf = NULL;
+ ctx->proto = NULL;
+ ctx->params = NULL;
+ ctx->nr_params = 0;
+ }
+}
+
+/* Return 1 if the field separater is arrow operator ('->') */
+static int split_next_field(char *varname, char **next_field,
+ struct traceprobe_parse_context *ctx)
+{
+ char *field;
+ int ret = 0;
+
+ field = strpbrk(varname, ".-");
+ if (field) {
+ if (field[0] == '-' && field[1] == '>') {
+ field[0] = '\0';
+ field += 2;
+ ret = 1;
+ } else if (field[0] == '.') {
+ field[0] = '\0';
+ field += 1;
+ } else {
+ trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN);
+ return -EINVAL;
+ }
+ *next_field = field;
+ }
+
+ return ret;
+}
+
+/*
+ * Parse the field of data structure. The @type must be a pointer type
+ * pointing the target data structure type.
+ */
+static int parse_btf_field(char *fieldname, const struct btf_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+ const struct btf_member *field;
+ u32 bitoffs, anon_offs;
+ char *next;
+ int is_ptr;
+ s32 tid;
+
+ do {
+ /* Outer loop for solving arrow operator ('->') */
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+ /* Convert a struct pointer type to a struct type */
+ type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ bitoffs = 0;
+ do {
+ /* Inner loop for solving dot operator ('.') */
+ next = NULL;
+ is_ptr = split_next_field(fieldname, &next, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+
+ anon_offs = 0;
+ field = btf_find_struct_member(ctx->btf, type, fieldname,
+ &anon_offs);
+ if (!field) {
+ trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+ return -ENOENT;
+ }
+ /* Add anonymous structure/union offset */
+ bitoffs += anon_offs;
+
+ /* Accumulate the bit-offsets of the dot-connected fields */
+ if (btf_type_kflag(type)) {
+ bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+ ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+ } else {
+ bitoffs += field->offset;
+ ctx->last_bitsize = 0;
+ }
+
+ type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ ctx->offset += next - fieldname;
+ fieldname = next;
+ } while (!is_ptr && fieldname);
+
+ if (++code == end) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ code->op = FETCH_OP_DEREF; /* TODO: user deref support */
+ code->offset = bitoffs / 8;
+ *pcode = code;
+
+ ctx->last_bitoffs = bitoffs % 8;
+ ctx->last_type = type;
+ } while (fieldname);
+
+ return 0;
+}
+
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+ const struct btf_param *params;
+ const struct btf_type *type;
+ char *field = NULL;
+ int i, is_ptr, ret;
+ u32 tid;
+
+ if (WARN_ON_ONCE(!ctx->funcname))
+ return -EINVAL;
+
+ is_ptr = split_next_field(varname, &field, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+ if (!is_ptr && field) {
+ /* dot-connected field on an argument is not supported. */
+ trace_probe_log_err(ctx->offset + field - varname,
+ NOSUP_DAT_ARG);
+ return -EOPNOTSUPP;
+ }
+
+ if (ctx->flags & TPARG_FL_RETURN) {
+ if (strcmp(varname, "$retval") != 0) {
+ trace_probe_log_err(ctx->offset, NO_BTFARG);
+ return -ENOENT;
+ }
+ code->op = FETCH_OP_RETVAL;
+ /* Check whether the function return type is not void */
+ if (query_btf_context(ctx) == 0) {
+ if (ctx->proto->type == 0) {
+ trace_probe_log_err(ctx->offset, NO_RETVAL);
+ return -ENOENT;
+ }
+ tid = ctx->proto->type;
+ goto found;
+ }
+ if (field) {
+ trace_probe_log_err(ctx->offset + field - varname,
+ NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ return 0;
+ }
+
+ if (!ctx->btf) {
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
+ trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
+ return PTR_ERR(params);
+ }
+ }
+ params = ctx->params;
+
+ for (i = 0; i < ctx->nr_params; i++) {
+ const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
+
+ if (name && !strcmp(name, varname)) {
+ code->op = FETCH_OP_ARG;
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param = i + 1;
+ else
+ code->param = i;
+ tid = params[i].type;
+ goto found;
+ }
+ }
+ trace_probe_log_err(ctx->offset, NO_BTFARG);
+ return -ENOENT;
+
+found:
+ type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+ /* Initialize the last type information */
+ ctx->last_type = type;
+ ctx->last_bitoffs = 0;
+ ctx->last_bitsize = 0;
+ if (field) {
+ ctx->offset += field - varname;
+ return parse_btf_field(field, type, pcode, end, ctx);
+ }
+ return 0;
+}
+
+static const struct fetch_type *find_fetch_type_from_btf_type(
+ struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = ctx->btf;
+ const char *typestr = NULL;
+
+ if (btf && ctx->last_type)
+ typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx);
+
+ return find_fetch_type(typestr, ctx->flags);
+}
+
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+
+ if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0)
+ return 0;
+
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ *pcode = code;
+
+ code->op = FETCH_OP_MOD_BF;
+ code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs);
+ code->rshift = 64 - ctx->last_bitsize;
+ code->basesize = 64 / 8;
+ return 0;
+}
+
+#else
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
+{
+ ctx->btf = NULL;
+}
+
+static int query_btf_context(struct traceprobe_parse_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
+
+#define find_fetch_type_from_btf_type(ctx) \
+ find_fetch_type(NULL, ctx->flags)
+
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ return 0;
+}
+
+#endif
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_insn *code, unsigned int flags, int offs)
+/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
+static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
+ struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
{
+ struct fetch_insn *code = *pcode;
+ int err = TP_ERR_BAD_VAR;
+ char *arg = orig_arg + 1;
unsigned long param;
int ret = 0;
int len;
- if (strcmp(arg, "retval") == 0) {
- if (flags & TPARG_FL_RETURN) {
+ if (ctx->flags & TPARG_FL_TEVENT) {
+ if (code->data)
+ return -EFAULT;
+ ret = parse_trace_event_arg(arg, code, ctx);
+ if (!ret)
+ return 0;
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+ code->op = FETCH_OP_COMM;
+ return 0;
+ }
+ /* backward compatibility */
+ ctx->offset = 0;
+ goto inval;
+ }
+
+ if (str_has_prefix(arg, "retval")) {
+ if (!(ctx->flags & TPARG_FL_RETURN)) {
+ err = TP_ERR_RETVAL_ON_PROBE;
+ goto inval;
+ }
+ if (!(ctx->flags & TPARG_FL_KERNEL) ||
+ !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) {
code->op = FETCH_OP_RETVAL;
- } else {
- trace_probe_log_err(offs, RETVAL_ON_PROBE);
- ret = -EINVAL;
+ return 0;
}
- } else if ((len = str_has_prefix(arg, "stack"))) {
+ return parse_btf_arg(orig_arg, pcode, end, ctx);
+ }
+
+ len = str_has_prefix(arg, "stack");
+ if (len) {
+
if (arg[len] == '\0') {
code->op = FETCH_OP_STACKP;
- } else if (isdigit(arg[len])) {
+ return 0;
+ }
+
+ if (isdigit(arg[len])) {
ret = kstrtoul(arg + len, 10, &param);
- if (ret) {
- goto inval_var;
- } else if ((flags & TPARG_FL_KERNEL) &&
- param > PARAM_MAX_STACK) {
- trace_probe_log_err(offs, BAD_STACK_NUM);
- ret = -EINVAL;
- } else {
- code->op = FETCH_OP_STACK;
- code->param = (unsigned int)param;
+ if (ret)
+ goto inval;
+
+ if ((ctx->flags & TPARG_FL_KERNEL) &&
+ param > PARAM_MAX_STACK) {
+ err = TP_ERR_BAD_STACK_NUM;
+ goto inval;
}
- } else
- goto inval_var;
- } else if (strcmp(arg, "comm") == 0) {
+ code->op = FETCH_OP_STACK;
+ code->param = (unsigned int)param;
+ return 0;
+ }
+ goto inval;
+ }
+
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
code->op = FETCH_OP_COMM;
+ return 0;
+ }
+
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
- } else if (((flags & TPARG_FL_MASK) ==
- (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
- (len = str_has_prefix(arg, "arg"))) {
+ len = str_has_prefix(arg, "arg");
+ if (len && tparg_is_function_entry(ctx->flags)) {
ret = kstrtoul(arg + len, 10, &param);
- if (ret) {
- goto inval_var;
- } else if (!param || param > PARAM_MAX_STACK) {
- trace_probe_log_err(offs, BAD_ARG_NUM);
- return -EINVAL;
+ if (ret)
+ goto inval;
+
+ if (!param || param > PARAM_MAX_STACK) {
+ err = TP_ERR_BAD_ARG_NUM;
+ goto inval;
}
+
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
+ /*
+ * The tracepoint probe will probe a stub function, and the
+ * first parameter of the stub is a dummy and should be ignored.
+ */
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param++;
+ return 0;
+ }
#endif
- } else
- goto inval_var;
- return ret;
-
-inval_var:
- trace_probe_log_err(offs, BAD_VAR);
+inval:
+ __trace_probe_log_err(ctx->offset, err);
return -EINVAL;
}
@@ -346,6 +877,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
return -EINVAL;
}
*pbuf = kstrndup(str, len - 1, GFP_KERNEL);
+ if (!*pbuf)
+ return -ENOMEM;
return 0;
}
@@ -353,7 +886,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
static int
parse_probe_arg(char *arg, const struct fetch_type *type,
struct fetch_insn **pcode, struct fetch_insn *end,
- unsigned int flags, int offs)
+ struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code = *pcode;
unsigned long param;
@@ -364,24 +897,29 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, type, code, flags, offs);
+ ret = parse_probe_vars(arg, type, pcode, end, ctx);
break;
case '%': /* named register */
+ if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) {
+ /* eprobe and fprobe do not handle registers */
+ trace_probe_log_err(ctx->offset, BAD_VAR);
+ break;
+ }
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
code->op = FETCH_OP_REG;
code->param = (unsigned int)ret;
ret = 0;
} else
- trace_probe_log_err(offs, BAD_REG_NAME);
+ trace_probe_log_err(ctx->offset, BAD_REG_NAME);
break;
case '@': /* memory, file-offset or symbol */
if (isdigit(arg[1])) {
ret = kstrtoul(arg + 1, 0, &param);
if (ret) {
- trace_probe_log_err(offs, BAD_MEM_ADDR);
+ trace_probe_log_err(ctx->offset, BAD_MEM_ADDR);
break;
}
/* load address */
@@ -389,13 +927,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->immediate = param;
} else if (arg[1] == '+') {
/* kprobes don't support file offsets */
- if (flags & TPARG_FL_KERNEL) {
- trace_probe_log_err(offs, FILE_ON_KPROBE);
+ if (ctx->flags & TPARG_FL_KERNEL) {
+ trace_probe_log_err(ctx->offset, FILE_ON_KPROBE);
return -EINVAL;
}
ret = kstrtol(arg + 2, 0, &offset);
if (ret) {
- trace_probe_log_err(offs, BAD_FILE_OFFS);
+ trace_probe_log_err(ctx->offset, BAD_FILE_OFFS);
break;
}
@@ -403,8 +941,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->immediate = (unsigned long)offset; // imm64?
} else {
/* uprobes don't support symbols */
- if (!(flags & TPARG_FL_KERNEL)) {
- trace_probe_log_err(offs, SYM_ON_UPROBE);
+ if (!(ctx->flags & TPARG_FL_KERNEL)) {
+ trace_probe_log_err(ctx->offset, SYM_ON_UPROBE);
return -EINVAL;
}
/* Preserve symbol for updating */
@@ -413,7 +951,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
if (!code->data)
return -ENOMEM;
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
code->op = FETCH_OP_IMM;
@@ -421,7 +959,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
}
/* These are fetching from memory */
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
*pcode = code;
@@ -440,47 +978,51 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
arg++; /* Skip '+', because kstrtol() rejects it. */
tmp = strchr(arg, '(');
if (!tmp) {
- trace_probe_log_err(offs, DEREF_NEED_BRACE);
+ trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE);
return -EINVAL;
}
*tmp = '\0';
ret = kstrtol(arg, 0, &offset);
if (ret) {
- trace_probe_log_err(offs, BAD_DEREF_OFFS);
+ trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS);
break;
}
- offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
+ ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
arg = tmp + 1;
tmp = strrchr(arg, ')');
if (!tmp) {
- trace_probe_log_err(offs + strlen(arg),
+ trace_probe_log_err(ctx->offset + strlen(arg),
DEREF_OPEN_BRACE);
return -EINVAL;
} else {
- const struct fetch_type *t2 = find_fetch_type(NULL);
+ const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
+ int cur_offs = ctx->offset;
*tmp = '\0';
- ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
+ ret = parse_probe_arg(arg, t2, &code, end, ctx);
if (ret)
break;
+ ctx->offset = cur_offs;
if (code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) {
- trace_probe_log_err(offs, COMM_CANT_DEREF);
+ trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
return -EINVAL;
}
if (++code == end) {
- trace_probe_log_err(offs, TOO_MANY_OPS);
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
*pcode = code;
code->op = deref;
code->offset = offset;
+ /* Reset the last type if used */
+ ctx->last_type = NULL;
}
break;
case '\\': /* Immediate value */
if (arg[1] == '"') { /* Immediate string */
- ret = __parse_imm_string(arg + 2, &tmp, offs + 2);
+ ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2);
if (ret)
break;
code->op = FETCH_OP_DATA;
@@ -488,15 +1030,24 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
} else {
ret = str_to_immediate(arg + 1, &code->immediate);
if (ret)
- trace_probe_log_err(offs + 1, BAD_IMM);
+ trace_probe_log_err(ctx->offset + 1, BAD_IMM);
else
code->op = FETCH_OP_IMM;
}
break;
+ default:
+ if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
+ if (!tparg_is_function_entry(ctx->flags)) {
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EINVAL;
+ }
+ ret = parse_btf_arg(arg, pcode, end, ctx);
+ break;
+ }
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
- trace_probe_log_err(offs, BAD_FETCH_ARG);
+ trace_probe_log_err(ctx->offset, BAD_FETCH_ARG);
ret = -EINVAL;
}
return ret;
@@ -540,26 +1091,35 @@ static int __parse_bitfield_probe_arg(const char *bf,
}
/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
- struct probe_arg *parg, unsigned int flags, int offset)
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
+ struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code, *scode, *tmp = NULL;
char *t, *t2, *t3;
int ret, len;
+ char *arg;
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ ret = -EINVAL;
len = strlen(arg);
if (len > MAX_ARGSTR_LEN) {
- trace_probe_log_err(offset, ARG_TOO_LONG);
- return -EINVAL;
+ trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
+ goto out;
} else if (len == 0) {
- trace_probe_log_err(offset, NO_ARG_BODY);
- return -EINVAL;
+ trace_probe_log_err(ctx->offset, NO_ARG_BODY);
+ goto out;
}
+ ret = -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
if (!parg->comm)
- return -ENOMEM;
+ goto out;
+ ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
*t = '\0';
@@ -568,43 +1128,72 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
*t2++ = '\0';
t3 = strchr(t2, ']');
if (!t3) {
- offset += t2 + strlen(t2) - arg;
- trace_probe_log_err(offset,
+ int offs = t2 + strlen(t2) - arg;
+
+ trace_probe_log_err(ctx->offset + offs,
ARRAY_NO_CLOSE);
- return -EINVAL;
+ goto out;
} else if (t3[1] != '\0') {
- trace_probe_log_err(offset + t3 + 1 - arg,
+ trace_probe_log_err(ctx->offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- return -EINVAL;
+ goto out;
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
- trace_probe_log_err(offset + t2 - arg,
+ trace_probe_log_err(ctx->offset + t2 - arg,
BAD_ARRAY_NUM);
- return -EINVAL;
+ goto out;
}
if (parg->count > MAX_ARRAY_LEN) {
- trace_probe_log_err(offset + t2 - arg,
+ trace_probe_log_err(ctx->offset + t2 - arg,
ARRAY_TOO_BIG);
- return -EINVAL;
+ goto out;
}
}
}
/*
- * Since $comm and immediate string can not be dereferred,
- * we can find those by strcmp.
+ * Since $comm and immediate string can not be dereferenced,
+ * we can find those by strcmp. But ignore for eprobes.
*/
- if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
- /* The type of $comm must be "string", and not an array. */
- if (parg->count || (t && strcmp(t, "string")))
- return -EINVAL;
- parg->type = find_fetch_type("string");
+ if (!(ctx->flags & TPARG_FL_TEVENT) &&
+ (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
+ strncmp(arg, "\\\"", 2) == 0)) {
+ /* The type of $comm must be "string", and not an array type. */
+ if (parg->count || (t && strcmp(t, "string"))) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ NEED_STRING_TYPE);
+ goto out;
+ }
+ parg->type = find_fetch_type("string", ctx->flags);
} else
- parg->type = find_fetch_type(t);
+ parg->type = find_fetch_type(t, ctx->flags);
if (!parg->type) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
- return -EINVAL;
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
+ goto out;
+ }
+
+ code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
+ if (!code)
+ goto out;
+ code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+
+ ctx->last_type = NULL;
+ ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
+ ctx);
+ if (ret)
+ goto fail;
+
+ /* Update storing type if BTF is available */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!t) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(t, "string")) {
+ ret = check_prepare_btf_string_fetch(t, &code, ctx);
+ if (ret)
+ goto fail;
+ }
}
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
@@ -612,45 +1201,49 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt)
- return -ENOMEM;
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto out;
+ }
snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
parg->count);
}
- code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code)
- return -ENOMEM;
- code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
-
- ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- flags, offset);
- if (ret)
- goto fail;
-
+ ret = -EINVAL;
/* Store operation */
- if (!strcmp(parg->type->name, "string") ||
- !strcmp(parg->type->name, "ustring")) {
- if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
- code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
- code->op != FETCH_OP_DATA) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
- BAD_STRING);
- ret = -EINVAL;
- goto fail;
+ if (parg->type->is_string) {
+ if (!strcmp(parg->type->name, "symstr")) {
+ if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
+ code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
+ code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ BAD_SYMSTRING);
+ goto fail;
+ }
+ } else {
+ if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+ code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ BAD_STRING);
+ goto fail;
+ }
}
- if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
- code->op == FETCH_OP_DATA) || parg->count) {
+ if (!strcmp(parg->type->name, "symstr") ||
+ (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+ code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
+ parg->count) {
/*
* IMM, DATA and COMM is pointing actual address, those
* must be kept, and if parg->count != 0, this is an
* array of string pointers instead of string address
* itself.
+ * For the symstr, it doesn't need to dereference, thus
+ * it just get the value.
*/
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
}
@@ -658,6 +1251,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
code->op = FETCH_OP_ST_USTRING;
+ else if (!strcmp(parg->type->name, "symstr"))
+ code->op = FETCH_OP_ST_SYMSTR;
else
code->op = FETCH_OP_ST_STRING;
code->size = parg->type->size;
@@ -671,8 +1266,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
} else {
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
code->op = FETCH_OP_ST_RAW;
@@ -683,24 +1277,28 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (t != NULL) {
ret = __parse_bitfield_probe_arg(t, parg->type, &code);
if (ret) {
- trace_probe_log_err(offset + t - arg, BAD_BITFIELD);
+ trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
goto fail;
}
+ } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ ret = parse_btf_bitfield(&code, ctx);
+ if (ret)
+ goto fail;
}
+ ret = -EINVAL;
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING &&
scode->op != FETCH_OP_ST_USTRING) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
BAD_STRING);
- ret = -EINVAL;
goto fail;
}
code++;
if (code->op != FETCH_OP_NOP) {
- trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
goto fail;
}
code->op = FETCH_OP_LP_ARRAY;
@@ -709,6 +1307,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
code->op = FETCH_OP_END;
+ ret = 0;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -724,6 +1323,8 @@ fail:
kfree(code->data);
}
kfree(tmp);
+out:
+ kfree(arg);
return ret;
}
@@ -745,11 +1346,38 @@ static int traceprobe_conflict_field_name(const char *name,
return 0;
}
-int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
- unsigned int flags)
+static char *generate_probe_arg_name(const char *arg, int idx)
+{
+ char *name = NULL;
+ const char *end;
+
+ /*
+ * If argument name is omitted, try arg as a name (BTF variable)
+ * or "argN".
+ */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) {
+ end = strchr(arg, ':');
+ if (!end)
+ end = arg + strlen(arg);
+
+ name = kmemdup_nul(arg, end - arg, GFP_KERNEL);
+ if (!name || !is_good_name(name)) {
+ kfree(name);
+ name = NULL;
+ }
+ }
+
+ if (!name)
+ name = kasprintf(GFP_KERNEL, "arg%d", idx + 1);
+
+ return name;
+}
+
+int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
+ struct traceprobe_parse_context *ctx)
{
struct probe_arg *parg = &tp->args[i];
- char *body;
+ const char *body;
/* Increment count for freeing args in error case */
tp->nr_args++;
@@ -766,8 +1394,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
body++;
} else {
- /* If argument name is omitted, set "argN" */
- parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1);
+ parg->name = generate_probe_arg_name(arg, i);
body = arg;
}
if (!parg->name)
@@ -781,9 +1408,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
trace_probe_log_err(0, USED_ARG_NAME);
return -EINVAL;
}
+ ctx->offset = body - arg;
/* Parse fetch argument */
- return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
- body - arg);
+ return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx);
}
void traceprobe_free_probe_arg(struct probe_arg *arg)
@@ -802,6 +1429,151 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
kfree(arg->fmt);
}
+static int argv_has_var_arg(int argc, const char *argv[], int *args_idx,
+ struct traceprobe_parse_context *ctx)
+{
+ int i, found = 0;
+
+ for (i = 0; i < argc; i++)
+ if (str_has_prefix(argv[i], "$arg")) {
+ trace_probe_log_set_index(i + 2);
+
+ if (!tparg_is_function_entry(ctx->flags)) {
+ trace_probe_log_err(0, NOFENTRY_ARGS);
+ return -EINVAL;
+ }
+
+ if (isdigit(argv[i][4])) {
+ found = 1;
+ continue;
+ }
+
+ if (argv[i][4] != '*') {
+ trace_probe_log_err(0, BAD_VAR);
+ return -EINVAL;
+ }
+
+ if (*args_idx >= 0 && *args_idx < argc) {
+ trace_probe_log_err(0, DOUBLE_ARGS);
+ return -EINVAL;
+ }
+ found = 1;
+ *args_idx = i;
+ }
+
+ return found;
+}
+
+static int sprint_nth_btf_arg(int idx, const char *type,
+ char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx)
+{
+ const char *name;
+ int ret;
+
+ if (idx >= ctx->nr_params) {
+ trace_probe_log_err(0, NO_BTFARG);
+ return -ENOENT;
+ }
+ name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off);
+ if (!name) {
+ trace_probe_log_err(0, NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ ret = snprintf(buf, bufsize, "%s%s", name, type);
+ if (ret >= bufsize) {
+ trace_probe_log_err(0, ARGS_2LONG);
+ return -E2BIG;
+ }
+ return ret;
+}
+
+/* Return new_argv which must be freed after use */
+const char **traceprobe_expand_meta_args(int argc, const char *argv[],
+ int *new_argc, char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx)
+{
+ const struct btf_param *params = NULL;
+ int i, j, n, used, ret, args_idx = -1;
+ const char **new_argv = NULL;
+
+ ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!ret) {
+ *new_argc = argc;
+ return NULL;
+ }
+
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
+ if (args_idx != -1) {
+ /* $arg* requires BTF info */
+ trace_probe_log_err(0, NOSUP_BTFARG);
+ return (const char **)params;
+ }
+ *new_argc = argc;
+ return NULL;
+ }
+
+ if (args_idx >= 0)
+ *new_argc = argc + ctx->nr_params - 1;
+ else
+ *new_argc = argc;
+
+ new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL);
+ if (!new_argv)
+ return ERR_PTR(-ENOMEM);
+
+ used = 0;
+ for (i = 0, j = 0; i < argc; i++) {
+ trace_probe_log_set_index(i + 2);
+ if (i == args_idx) {
+ for (n = 0; n < ctx->nr_params; n++) {
+ ret = sprint_nth_btf_arg(n, "", buf + used,
+ bufsize - used, ctx);
+ if (ret < 0)
+ goto error;
+
+ new_argv[j++] = buf + used;
+ used += ret + 1;
+ }
+ continue;
+ }
+
+ if (str_has_prefix(argv[i], "$arg")) {
+ char *type = NULL;
+
+ n = simple_strtoul(argv[i] + 4, &type, 10);
+ if (type && !(*type == ':' || *type == '\0')) {
+ trace_probe_log_err(0, BAD_VAR);
+ ret = -ENOENT;
+ goto error;
+ }
+ /* Note: $argN starts from $arg1 */
+ ret = sprint_nth_btf_arg(n - 1, type, buf + used,
+ bufsize - used, ctx);
+ if (ret < 0)
+ goto error;
+ new_argv[j++] = buf + used;
+ used += ret + 1;
+ } else
+ new_argv[j++] = argv[i];
+ }
+
+ return new_argv;
+
+error:
+ kfree(new_argv);
+ return ERR_PTR(ret);
+}
+
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
+{
+ clear_btf_context(ctx);
+}
+
int traceprobe_update_arg(struct probe_arg *arg)
{
struct fetch_insn *code = arg->code;
@@ -839,19 +1611,29 @@ int traceprobe_update_arg(struct probe_arg *arg)
/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
- bool is_return)
+ enum probe_print_type ptype)
{
struct probe_arg *parg;
int i, j;
int pos = 0;
const char *fmt, *arg;
- if (!is_return) {
+ switch (ptype) {
+ case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
- } else {
+ arg = ", REC->" FIELD_STRING_IP;
+ break;
+ case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
- arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ break;
+ case PROBE_PRINT_EVENT:
+ fmt = "";
+ arg = "";
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
@@ -871,13 +1653,12 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
parg->type->fmt);
}
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg);
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -885,8 +1666,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";
@@ -900,20 +1680,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
}
#undef LEN_OR_ZERO
-int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
+int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int len;
char *print_fmt;
/* First: called with 0 length to calculate the needed length */
- len = __set_print_fmt(tp, NULL, 0, is_return);
+ len = __set_print_fmt(tp, NULL, 0, ptype);
print_fmt = kmalloc(len + 1, GFP_KERNEL);
if (!print_fmt)
return -ENOMEM;
/* Second: actually write the @print_fmt */
- __set_print_fmt(tp, print_fmt, len + 1, is_return);
+ __set_print_fmt(tp, print_fmt, len + 1, ptype);
call->print_fmt = print_fmt;
return 0;
@@ -1029,11 +1809,36 @@ error:
return ret;
}
+static struct trace_event_call *
+find_trace_event_call(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ return tp_event;
+ }
+
+ return NULL;
+}
+
int trace_probe_register_event_call(struct trace_probe *tp)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int ret;
+ lockdep_assert_held(&event_mutex);
+
+ if (find_trace_event_call(trace_probe_group_name(tp),
+ trace_probe_name(tp)))
+ return -EEXIST;
+
ret = register_trace_event(&call->event);
if (!ret)
return -ENODEV;
@@ -1083,8 +1888,7 @@ int trace_probe_remove_file(struct trace_probe *tp,
return -ENOENT;
list_del_rcu(&link->list);
- synchronize_rcu();
- kfree(link);
+ kvfree_rcu_mightsleep(link);
if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
@@ -1134,3 +1938,47 @@ bool trace_probe_match_command_args(struct trace_probe *tp,
}
return true;
}
+
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **))
+{
+ int argc = 0, ret = 0;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, (const char **)argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field)
+{
+ void *p;
+ int i, j;
+
+ for (i = 0; i < nr_args; i++) {
+ struct probe_arg *a = args + i;
+
+ trace_seq_printf(s, " %s=", a->name);
+ if (likely(!a->count)) {
+ if (!a->type->print(s, data + a->offset, field))
+ return -ENOMEM;
+ continue;
+ }
+ trace_seq_putc(s, '{');
+ p = data + a->offset;
+ for (j = 0; j < a->count; j++) {
+ if (!a->type->print(s, p, field))
+ return -ENOMEM;
+ trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
+ p += a->type->size;
+ }
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index a22b62813f8c..c1877d018269 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -16,7 +16,6 @@
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <linux/ctype.h>
#include <linux/ptrace.h>
#include <linux/perf_event.h>
#include <linux/kprobes.h>
@@ -24,6 +23,7 @@
#include <linux/limits.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
+#include <linux/btf.h>
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -33,7 +33,9 @@
#define MAX_ARGSTR_LEN 63
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
+#define MAX_BTF_ARGS_LEN 128
#define MAX_STRING_SIZE PATH_MAX
+#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -99,10 +101,12 @@ enum fetch_op {
FETCH_OP_ST_UMEM, /* Mem: .offset, .size */
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
+ FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
FETCH_OP_LP_ARRAY, /* Array: .param = loop count */
+ FETCH_OP_TP_ARG, /* Trace Point argument */
FETCH_OP_END,
FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */
};
@@ -133,9 +137,10 @@ struct fetch_insn {
struct fetch_type {
const char *name; /* Name of type */
size_t size; /* Byte size of type */
- int is_signed; /* Signed flag */
+ bool is_signed; /* Signed flag */
+ bool is_string; /* String flag */
print_type_func_t print; /* Print functions */
- const char *fmt; /* Fromat string */
+ const char *fmt; /* Format string */
const char *fmttype; /* Name in format file */
};
@@ -164,6 +169,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(char);
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
@@ -177,16 +183,19 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t)
#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG)
-#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- {.name = _name, \
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \
+ {.name = _name, \
.size = _size, \
- .is_signed = sign, \
+ .is_signed = (bool)sign, \
+ .is_string = (bool)str, \
.print = PRINT_TYPE_FUNC_NAME(ptype), \
.fmt = PRINT_TYPE_FMT_NAME(ptype), \
.fmttype = _fmttype, \
}
+
+/* Non string types can use these macros */
#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype)
+ __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype)
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
_ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype)
@@ -302,7 +311,7 @@ trace_probe_primary_from_call(struct trace_event_call *call)
{
struct trace_probe_event *tpe = trace_probe_event_from_call(call);
- return list_first_entry(&tpe->probes, struct trace_probe, list);
+ return list_first_entry_or_null(&tpe->probes, struct trace_probe, list);
}
static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp)
@@ -342,40 +351,78 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
int argc, const char **argv);
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field);
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
list_for_each_entry_rcu(pos, &(tp)->event->files, list)
-/* Check the name is good for event/group/fields */
-static inline bool is_good_name(const char *name)
-{
- if (!isalpha(*name) && *name != '_')
- return false;
- while (*++name != '\0') {
- if (!isalpha(*name) && !isdigit(*name) && *name != '_')
- return false;
- }
- return true;
-}
-
+/*
+ * The flags used for parsing trace_probe arguments.
+ * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive.
+ * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive.
+ * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with
+ * TPARG_FL_KERNEL.
+ */
#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
-#define TPARG_FL_MASK GENMASK(2, 0)
+#define TPARG_FL_TEVENT BIT(3)
+#define TPARG_FL_USER BIT(4)
+#define TPARG_FL_FPROBE BIT(5)
+#define TPARG_FL_TPOINT BIT(6)
+#define TPARG_FL_LOC_MASK GENMASK(4, 0)
+
+static inline bool tparg_is_function_entry(unsigned int flags)
+{
+ return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
+}
+
+struct traceprobe_parse_context {
+ struct trace_event_call *event;
+ /* BTF related parameters */
+ const char *funcname; /* Function name in BTF */
+ const struct btf_type *proto; /* Prototype of the function */
+ const struct btf_param *params; /* Parameter of the function */
+ s32 nr_params; /* The number of the parameters */
+ struct btf *btf; /* The BTF to be used */
+ const struct btf_type *last_type; /* Saved type */
+ u32 last_bitoffs; /* Saved bitoffs */
+ u32 last_bitsize; /* Saved bitsize */
+ unsigned int flags;
+ int offset;
+};
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
- char *arg, unsigned int flags);
+ const char *argv,
+ struct traceprobe_parse_context *ctx);
+const char **traceprobe_expand_meta_args(int argc, const char *argv[],
+ int *new_argc, char *buf, int bufsize,
+ struct traceprobe_parse_context *ctx);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+/*
+ * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called,
+ * this MUST be called for clean up the context and return a resource.
+ */
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset);
-extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
+enum probe_print_type {
+ PROBE_PRINT_NORMAL,
+ PROBE_PRINT_RETURN,
+ PROBE_PRINT_EVENT,
+};
+
+extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype);
#ifdef CONFIG_PERF_EVENTS
extern struct trace_event_call *
@@ -399,18 +446,23 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
- C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
+ C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \
C(BAD_MAXACT, "Invalid maxactive number"), \
C(MAXACT_TOO_BIG, "Maxactive is too big"), \
C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
+ C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
+ C(NO_TRACEPOINT, "Tracepoint is not found"), \
+ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
C(NO_GROUP_NAME, "Group name is not specified"), \
C(GROUP_TOO_LONG, "Group name is too long"), \
C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \
C(NO_EVENT_NAME, "Event name is not specified"), \
C(EVENT_TOO_LONG, "Event name is too long"), \
C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
+ C(EVENT_EXIST, "Given group/event name is already used by another event"), \
C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
+ C(NO_RETVAL, "This function returns 'void' type"), \
C(BAD_STACK_NUM, "Invalid stack number"), \
C(BAD_ARG_NUM, "Invalid argument number"), \
C(BAD_VAR, "Invalid $-valiable specified"), \
@@ -433,6 +485,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(ARRAY_TOO_BIG, "Array number is too big"), \
C(BAD_TYPE, "Unknown type is specified"), \
C(BAD_STRING, "String accepts only memory argument"), \
+ C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \
C(BAD_BITFIELD, "Invalid bitfield"), \
C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
C(NO_ARG_NAME, "Argument name is not specified"), \
@@ -444,7 +497,26 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(FAIL_REG_PROBE, "Failed to register probe event"),\
C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\
C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\
- C(SAME_PROBE, "There is already the exact same probe event"),
+ C(SAME_PROBE, "There is already the exact same probe event"),\
+ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
+ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
+ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
+ C(NO_EP_FILTER, "No filter rule after 'if'"), \
+ C(NOSUP_BTFARG, "BTF is not available or not supported"), \
+ C(NO_BTFARG, "This variable is not found at this probe point"),\
+ C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \
+ C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
+ C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
+ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
+ C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
+ C(ARGIDX_2BIG, "$argN index is too big"), \
+ C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \
+ C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\
+ C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
+ C(NO_BTF_FIELD, "This field is not found."), \
+ C(BAD_BTF_TID, "Failed to get BTF type info."),\
+ C(BAD_TYPE4STR, "This type does not fit for string."),\
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
#undef C
#define C(a, b) TP_ERR_##a
@@ -468,3 +540,8 @@ void __trace_probe_log_err(int offset, int err);
#define trace_probe_log_err(offs, err) \
__trace_probe_log_err(offs, TP_ERR_##err)
+
+struct uprobe_dispatch_data {
+ struct trace_uprobe *tu;
+ unsigned long bp_addr;
+};
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
new file mode 100644
index 000000000000..bb723eefd7b7
--- /dev/null
+++ b/kernel/trace/trace_probe_kernel.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_PROBE_KERNEL_H_
+#define __TRACE_PROBE_KERNEL_H_
+
+/*
+ * This depends on trace_probe.h, but can not include it due to
+ * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
+ * Which means that any other user must include trace_probe.h before including
+ * this file.
+ */
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ return (ret < 0) ? ret : len;
+}
+
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base)
+{
+ if (ret < 0)
+ ret = 0;
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ set_data_loc(ret, dest, __dest, base);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ set_data_loc(ret, dest, __dest, base);
+
+ return ret;
+}
+
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
+#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index e5282828f4a6..3935b347f874 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs,
+process_fetch_insn(struct fetch_insn *code, void *rec,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -67,6 +67,57 @@ probe_mem_read(void *dest, void *src, size_t size);
static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size);
+static nokprobe_inline int
+fetch_store_symstrlen(unsigned long addr)
+{
+ char namebuf[KSYM_SYMBOL_LEN];
+ int ret;
+
+ ret = sprint_symbol(namebuf, addr);
+ if (ret < 0)
+ return 0;
+
+ return ret + 1;
+}
+
+/*
+ * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_symstring(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ return sprint_symbol(__dest, addr);
+}
+
+/* common part of process_fetch_insn*/
+static nokprobe_inline int
+process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
+{
+ switch (code->op) {
+ case FETCH_OP_IMM:
+ *val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ *val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ *val = (unsigned long)code->data;
+ break;
+ default:
+ return -EILSEQ;
+ }
+ return 0;
+}
+
/* From the 2nd stage, routine is same */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
@@ -99,16 +150,22 @@ stage2:
stage3:
/* 3rd stage: store value to buffer */
if (unlikely(!dest)) {
- if (code->op == FETCH_OP_ST_STRING) {
+ switch (code->op) {
+ case FETCH_OP_ST_STRING:
ret = fetch_store_strlen(val + code->offset);
code++;
goto array;
- } else if (code->op == FETCH_OP_ST_USTRING) {
- ret += fetch_store_strlen_user(val + code->offset);
+ case FETCH_OP_ST_USTRING:
+ ret = fetch_store_strlen_user(val + code->offset);
code++;
goto array;
- } else
+ case FETCH_OP_ST_SYMSTR:
+ ret = fetch_store_symstrlen(val + code->offset);
+ code++;
+ goto array;
+ default:
return -EILSEQ;
+ }
}
switch (code->op) {
@@ -129,6 +186,10 @@ stage3:
loc = *(u32 *)dest;
ret = fetch_store_string_user(val + code->offset, dest, base);
break;
+ case FETCH_OP_ST_SYMSTR:
+ loc = *(u32 *)dest;
+ ret = fetch_store_symstring(val + code->offset, dest, base);
+ break;
default:
return -EILSEQ;
}
@@ -143,6 +204,8 @@ stage3:
array:
/* the last stage: Loop on array */
if (code->op == FETCH_OP_LP_ARRAY) {
+ if (ret < 0)
+ ret = 0;
total += ret;
if (++i < code->param) {
code = s3;
@@ -167,7 +230,7 @@ array:
return code->op == FETCH_OP_END ? ret : -EILSEQ;
}
-/* Sum up total data length for dynamic arraies (strings) */
+/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
@@ -188,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
+store_trace_args(void *data, struct trace_probe *tp, void *rec,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -203,40 +266,10 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, regs, dl, base);
- if (unlikely(ret < 0 && arg->dynamic)) {
- *dl = make_data_loc(0, dyndata - base);
- } else {
+ ret = process_fetch_insn(arg->code, rec, dl, base);
+ if (arg->dynamic && likely(ret > 0)) {
dyndata += ret;
maxlen -= ret;
}
}
}
-
-static inline int
-print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
- u8 *data, void *field)
-{
- void *p;
- int i, j;
-
- for (i = 0; i < nr_args; i++) {
- struct probe_arg *a = args + i;
-
- trace_seq_printf(s, " %s=", a->name);
- if (likely(!a->count)) {
- if (!a->type->print(s, data + a->offset, field))
- return -ENOMEM;
- continue;
- }
- trace_seq_putc(s, '{');
- p = data + a->offset;
- for (j = 0; j < a->count; j++) {
- if (!a->type->print(s, p, field))
- return -ENOMEM;
- trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
- p += a->type->size;
- }
- }
- return 0;
-}
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
new file mode 100644
index 000000000000..a520b11afb0d
--- /dev/null
+++ b/kernel/trace/trace_recursion_record.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+struct recursed_functions {
+ unsigned long ip;
+ unsigned long parent_ip;
+};
+
+static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE];
+static atomic_t nr_records;
+
+/*
+ * Cache the last found function. Yes, updates to this is racey, but
+ * so is memory cache ;-)
+ */
+static unsigned long cached_function;
+
+void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip)
+{
+ int index = 0;
+ int i;
+ unsigned long old;
+
+ again:
+ /* First check the last one recorded */
+ if (ip == cached_function)
+ return;
+
+ i = atomic_read(&nr_records);
+ /* nr_records is -1 when clearing records */
+ smp_mb__after_atomic();
+ if (i < 0)
+ return;
+
+ /*
+ * If there's two writers and this writer comes in second,
+ * the cmpxchg() below to update the ip will fail. Then this
+ * writer will try again. It is possible that index will now
+ * be greater than nr_records. This is because the writer
+ * that succeeded has not updated the nr_records yet.
+ * This writer could keep trying again until the other writer
+ * updates nr_records. But if the other writer takes an
+ * interrupt, and that interrupt locks up that CPU, we do
+ * not want this CPU to lock up due to the recursion protection,
+ * and have a bug report showing this CPU as the cause of
+ * locking up the computer. To not lose this record, this
+ * writer will simply use the next position to update the
+ * recursed_functions, and it will update the nr_records
+ * accordingly.
+ */
+ if (index < i)
+ index = i;
+ if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE)
+ return;
+
+ for (i = index - 1; i >= 0; i--) {
+ if (recursed_functions[i].ip == ip) {
+ cached_function = ip;
+ return;
+ }
+ }
+
+ cached_function = ip;
+
+ /*
+ * We only want to add a function if it hasn't been added before.
+ * Add to the current location before incrementing the count.
+ * If it fails to add, then increment the index (save in i)
+ * and try again.
+ */
+ old = cmpxchg(&recursed_functions[index].ip, 0, ip);
+ if (old != 0) {
+ /* Did something else already added this for us? */
+ if (old == ip)
+ return;
+ /* Try the next location (use i for the next index) */
+ index++;
+ goto again;
+ }
+
+ recursed_functions[index].parent_ip = parent_ip;
+
+ /*
+ * It's still possible that we could race with the clearing
+ * CPU0 CPU1
+ * ---- ----
+ * ip = func
+ * nr_records = -1;
+ * recursed_functions[0] = 0;
+ * i = -1
+ * if (i < 0)
+ * nr_records = 0;
+ * (new recursion detected)
+ * recursed_functions[0] = func
+ * cmpxchg(recursed_functions[0],
+ * func, 0)
+ *
+ * But the worse that could happen is that we get a zero in
+ * the recursed_functions array, and it's likely that "func" will
+ * be recorded again.
+ */
+ i = atomic_read(&nr_records);
+ smp_mb__after_atomic();
+ if (i < 0)
+ cmpxchg(&recursed_functions[index].ip, ip, 0);
+ else if (i <= index)
+ atomic_cmpxchg(&nr_records, i, index + 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_record_recursion);
+
+static DEFINE_MUTEX(recursed_function_lock);
+static struct trace_seq *tseq;
+
+static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos)
+{
+ void *ret = NULL;
+ int index;
+
+ mutex_lock(&recursed_function_lock);
+ index = atomic_read(&nr_records);
+ if (*pos < index) {
+ ret = &recursed_functions[*pos];
+ }
+
+ tseq = kzalloc(sizeof(*tseq), GFP_KERNEL);
+ if (!tseq)
+ return ERR_PTR(-ENOMEM);
+
+ trace_seq_init(tseq);
+
+ return ret;
+}
+
+static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int index;
+ int p;
+
+ index = atomic_read(&nr_records);
+ p = ++(*pos);
+
+ return p < index ? &recursed_functions[p] : NULL;
+}
+
+static void recursed_function_seq_stop(struct seq_file *m, void *v)
+{
+ kfree(tseq);
+ mutex_unlock(&recursed_function_lock);
+}
+
+static int recursed_function_seq_show(struct seq_file *m, void *v)
+{
+ struct recursed_functions *record = v;
+ int ret = 0;
+
+ if (record) {
+ trace_seq_print_sym(tseq, record->parent_ip, true);
+ trace_seq_puts(tseq, ":\t");
+ trace_seq_print_sym(tseq, record->ip, true);
+ trace_seq_putc(tseq, '\n');
+ ret = trace_print_seq(m, tseq);
+ }
+
+ return ret;
+}
+
+static const struct seq_operations recursed_function_seq_ops = {
+ .start = recursed_function_seq_start,
+ .next = recursed_function_seq_next,
+ .stop = recursed_function_seq_stop,
+ .show = recursed_function_seq_show
+};
+
+static int recursed_function_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&recursed_function_lock);
+ /* If this file was opened for write, then erase contents */
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ /* disable updating records */
+ atomic_set(&nr_records, -1);
+ smp_mb__after_atomic();
+ memset(recursed_functions, 0, sizeof(recursed_functions));
+ smp_wmb();
+ /* enable them again */
+ atomic_set(&nr_records, 0);
+ }
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &recursed_function_seq_ops);
+ mutex_unlock(&recursed_function_lock);
+
+ return ret;
+}
+
+static ssize_t recursed_function_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static int recursed_function_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+ return 0;
+}
+
+static const struct file_operations recursed_functions_fops = {
+ .open = recursed_function_open,
+ .write = recursed_function_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = recursed_function_release,
+};
+
+__init static int create_recursed_functions(void)
+{
+
+ trace_create_file("recursed_functions", TRACE_MODE_WRITE,
+ NULL, NULL, &recursed_functions_fops);
+ return 0;
+}
+
+fs_initcall(create_recursed_functions);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..c9ffdcfe622e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,7 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 97b10bb31a1f..0469a04a355f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,9 +26,9 @@ static struct task_struct *wakeup_task;
static int wakeup_cpu;
static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
-static int wakeup_rt;
-static int wakeup_dl;
-static int tracing_dl = 0;
+static bool wakeup_rt;
+static bool wakeup_dl;
+static bool tracing_dl;
static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -67,7 +67,7 @@ static bool function_enabled;
static int
func_prolog_preempt_disable(struct trace_array *tr,
struct trace_array_cpu **data,
- int *pc)
+ unsigned int *trace_ctx)
{
long disabled;
int cpu;
@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (likely(!wakeup_task))
return 0;
- *pc = preempt_count();
+ *trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc, ret = 0;
+ unsigned int trace_ctx;
+ int ret = 0;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
- local_save_flags(flags);
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- local_save_flags(flags);
- __trace_graph_return(tr, trace, flags, pc);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -171,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
@@ -212,18 +211,18 @@ static void wakeup_print_header(struct seq_file *s)
*/
static void
wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
local_irq_restore(flags);
atomic_dec(&data->disabled);
@@ -303,12 +302,12 @@ static void wakeup_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -375,7 +374,7 @@ static void
tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -383,7 +382,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct ctx_switch_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -396,14 +395,14 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
struct task_struct *curr,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
@@ -411,7 +410,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -424,19 +423,20 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array_cpu *data;
u64 T0, T1, delta;
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
tracing_record_cmdline(prev);
@@ -455,8 +455,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
if (next != wakeup_task)
return;
- pc = preempt_count();
-
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
@@ -464,6 +462,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
goto out;
local_irq_save(flags);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
arch_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */
@@ -473,9 +473,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* The task we are waiting for is waking up */
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx);
+ tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
@@ -501,7 +501,7 @@ static void __wakeup_reset(struct trace_array *tr)
{
wakeup_cpu = -1;
wakeup_prio = -1;
- tracing_dl = 0;
+ tracing_dl = false;
if (wakeup_task)
put_task_struct(wakeup_task);
@@ -527,9 +527,8 @@ probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
- unsigned long flags;
long disabled;
- int pc;
+ unsigned int trace_ctx;
if (likely(!tracer_enabled))
return;
@@ -550,11 +549,12 @@ probe_wakeup(void *ignore, struct task_struct *p)
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- pc = preempt_count();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
+ trace_ctx = tracing_gen_ctx();
+
/* interrupts should be off from try_to_wake_up */
arch_spin_lock(&wakeup_lock);
@@ -575,25 +575,23 @@ probe_wakeup(void *ignore, struct task_struct *p)
* another task until the first one wakes up.
*/
if (dl_task(p))
- tracing_dl = 1;
+ tracing_dl = true;
else
- tracing_dl = 0;
+ tracing_dl = false;
wakeup_task = get_task_struct(p);
- local_save_flags(flags);
-
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
- tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
* is not called by an assembly function (where as schedule is)
* it should be safe to use it here.
*/
- __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
out_locked:
arch_spin_unlock(&wakeup_lock);
@@ -690,8 +688,8 @@ static int wakeup_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 0;
- wakeup_rt = 0;
+ wakeup_dl = false;
+ wakeup_rt = false;
return __wakeup_tracer_init(tr);
}
@@ -700,8 +698,8 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 0;
- wakeup_rt = 1;
+ wakeup_dl = false;
+ wakeup_rt = true;
return __wakeup_tracer_init(tr);
}
@@ -710,8 +708,8 @@ static int wakeup_dl_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 1;
- wakeup_rt = 0;
+ wakeup_dl = true;
+ wakeup_rt = false;
return __wakeup_tracer_init(tr);
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b5e3496cf803..529590499b1f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt;
static void trace_selftest_test_probe1_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe1_cnt++;
}
@@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt;
static void trace_selftest_test_probe2_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe2_cnt++;
}
@@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt;
static void trace_selftest_test_probe3_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_probe3_cnt++;
}
@@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt;
static void trace_selftest_test_global_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_global_cnt++;
}
@@ -143,24 +143,21 @@ static int trace_selftest_test_dyn_cnt;
static void trace_selftest_test_dyn_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
trace_selftest_test_dyn_cnt++;
}
static struct ftrace_ops test_probe1 = {
.func = trace_selftest_test_probe1_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe2 = {
.func = trace_selftest_test_probe2_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe3 = {
.func = trace_selftest_test_probe3_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static void print_counts(void)
@@ -290,6 +287,40 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
if (trace_selftest_test_probe3_cnt != 4)
goto out_free;
+ /* Remove trace function from probe 3 */
+ func1_name = "!" __stringify(DYN_FTRACE_TEST_NAME);
+ len1 = strlen(func1_name);
+
+ ftrace_set_filter(&test_probe3, func1_name, len1, 0);
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 4)
+ goto out_free;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out_free;
+ }
+ if (trace_selftest_test_dyn_cnt == 0)
+ goto out_free;
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 5)
+ goto out_free;
+
ret = 0;
out_free:
unregister_ftrace_function(dyn_ops);
@@ -417,7 +448,7 @@ static int trace_selftest_recursion_cnt;
static void trace_selftest_test_recursion_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
/*
* This function is registered without the recursion safe flag.
@@ -432,7 +463,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
static void trace_selftest_test_recursion_safe_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
/*
* We said we would provide our own recursion. By calling
@@ -448,11 +479,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip,
static struct ftrace_ops test_rec_probe = {
.func = trace_selftest_test_recursion_func,
+ .flags = FTRACE_OPS_FL_RECURSION,
};
static struct ftrace_ops test_recsafe_probe = {
.func = trace_selftest_test_recursion_safe_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static int
@@ -492,8 +523,13 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_rec_probe);
ret = -1;
- if (trace_selftest_recursion_cnt != 1) {
- pr_cont("*callback not called once (%d)* ",
+ /*
+ * Recursion allows for transitions between context,
+ * and may call the callback twice.
+ */
+ if (trace_selftest_recursion_cnt != 1 &&
+ trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called once (or twice) (%d)* ",
trace_selftest_recursion_cnt);
goto out;
}
@@ -546,9 +582,11 @@ static enum {
static void trace_selftest_test_regs_func(unsigned long ip,
unsigned long pip,
struct ftrace_ops *op,
- struct pt_regs *pt_regs)
+ struct ftrace_regs *fregs)
{
- if (pt_regs)
+ struct pt_regs *regs = ftrace_get_regs(fregs);
+
+ if (regs)
trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
else
trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
@@ -556,7 +594,7 @@ static void trace_selftest_test_regs_func(unsigned long ip,
static struct ftrace_ops test_regs_probe = {
.func = trace_selftest_test_regs_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
};
static int
@@ -746,6 +784,10 @@ static struct fgraph_ops fgraph_ops __initdata = {
.retfunc = &trace_graph_return,
};
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static struct ftrace_ops direct;
+#endif
+
/*
* Pretty much the same than for the function tracer from which the selftest
* has been borrowed.
@@ -756,6 +798,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
{
int ret;
unsigned long count;
+ char *func_name __maybe_unused;
#ifdef CONFIG_DYNAMIC_FTRACE
if (ftrace_filter_param) {
@@ -782,7 +825,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
/* Have we just recovered from a hang? */
if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
- tracing_selftest_disabled = true;
+ disable_tracing_selftest("recovering from a hang");
ret = -1;
goto out;
}
@@ -804,8 +847,72 @@ trace_selftest_startup_function_graph(struct tracer *trace,
goto out;
}
- /* Don't test dynamic tracing, the function tracer already did */
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ /*
+ * These tests can take some time to run. Make sure on non PREEMPT
+ * kernels, we do not trigger the softlockup detector.
+ */
+ cond_resched();
+
+ tracing_reset_online_cpus(&tr->array_buffer);
+ set_graph_array(tr);
+ /*
+ * Some archs *cough*PowerPC*cough* add characters to the
+ * start of the function names. We simply put a '*' to
+ * accommodate them.
+ */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ ftrace_set_global_filter(func_name, strlen(func_name), 1);
+
+ /*
+ * Register direct function together with graph tracer
+ * and make sure we get graph trace.
+ */
+ ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
+ ret = register_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp);
+ if (ret)
+ goto out;
+
+ cond_resched();
+
+ ret = register_ftrace_graph(&fgraph_ops);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ count = 0;
+
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->array_buffer, &count);
+
+ unregister_ftrace_graph(&fgraph_ops);
+
+ ret = unregister_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp,
+ true);
+ if (ret)
+ goto out;
+
+ cond_resched();
+
+ tracing_start();
+
+ if (!ret && !count) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Enable tracing on all functions again */
+ ftrace_set_global_filter(NULL, 0, 1);
+#endif
+
+ /* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
if (ret)
@@ -874,7 +981,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
int ret;
/*
- * Now that the big kernel lock is no longer preemptable,
+ * Now that the big kernel lock is no longer preemptible,
* and this is called with the BKL held, it will always
* fail. If preemption is already disabled, simply
* pass the test. When the BKL is removed, or becomes
@@ -936,7 +1043,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
int ret;
/*
- * Now that the big kernel lock is no longer preemptable,
+ * Now that the big kernel lock is no longer preemptible,
* and this is called with the BKL held, it will always
* fail. If preemption is already disabled, simply
* pass the test. When the BKL is removed, or becomes
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1d84fcc78e3e..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,10 +13,7 @@
* trace_seq_init() more than once to reset the trace_seq to start
* from scratch.
*
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
- * A write to the buffer will either succed or fail. That is, unlike
+ * A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
* try to write something into the trace_seq buffer and if it fails
@@ -73,7 +70,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
* @fmt: printf format string
*
* The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
* trace_seq_printf() is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
@@ -131,9 +128,10 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
+ * @args: Arguments for the format string
*
* The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
* trace_seq_printf is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
@@ -226,7 +224,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
* @c: simple character to record
*
* The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple charater
+ * copy to user routines. This function records a simple character
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
*/
@@ -348,7 +346,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path)
EXPORT_SYMBOL_GPL(trace_seq_path);
/**
- * trace_seq_to_user - copy the squence buffer to user space
+ * trace_seq_to_user - copy the sequence buffer to user space
* @s: trace sequence descriptor
* @ubuf: The userspace memory location to copy to
* @cnt: The amount to copy
@@ -363,14 +361,18 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*
* On failure it returns -EBUSY if all of the content in the
* sequence has been already read, which includes nothing in the
- * sequenc (@s->len == @s->readpos).
+ * sequence (@s->len == @s->readpos).
*
* Returns -EFAULT if the copy to userspace fails.
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
+ int ret;
__trace_seq_init(s);
- return seq_buf_to_user(&s->seq, ubuf, cnt);
+ ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt);
+ if (ret > 0)
+ s->readpos += ret;
+ return ret;
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
@@ -403,3 +405,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
return 1;
}
EXPORT_SYMBOL(trace_seq_hex_dump);
+
+/*
+ * trace_seq_acquire - acquire seq buffer with size len
+ * @s: trace sequence descriptor
+ * @len: size of buffer to be acquired
+ *
+ * acquire buffer with size of @len from trace_seq for output usage,
+ * user can fill string into that buffer.
+ *
+ * Returns start address of acquired buffer.
+ *
+ * it allow multiple usage in one trace output function call.
+ */
+char *trace_seq_acquire(struct trace_seq *s, unsigned int len)
+{
+ char *ret = trace_seq_buffer_ptr(s);
+
+ if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len))
+ seq_buf_commit(&s->seq, len);
+
+ return ret;
+}
+EXPORT_SYMBOL(trace_seq_acquire);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 98bba4764c52..5a48dba912ea 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack)
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
unsigned long stack;
@@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static ssize_t
@@ -554,20 +553,20 @@ __setup("stacktrace", enable_stacktrace);
static __init int stack_trace_init(void)
{
- struct dentry *d_tracer;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- trace_create_file("stack_max_size", 0644, d_tracer,
+ trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL,
&stack_trace_max_size, &stack_max_size_fops);
- trace_create_file("stack_trace", 0444, d_tracer,
+ trace_create_file("stack_trace", TRACE_MODE_READ, NULL,
NULL, &stack_trace_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("stack_trace_filter", 0644, d_tracer,
+ trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL,
&trace_ops, &stack_trace_filter_fops);
#endif
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index d1fa19773cc8..bb247beec447 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -276,13 +276,13 @@ static const struct file_operations tracing_stat_fops = {
static int tracing_stat_init(void)
{
- struct dentry *d_tracing;
+ int ret;
- d_tracing = tracing_init_dentry();
- if (IS_ERR(d_tracing))
+ ret = tracing_init_dentry();
+ if (ret)
return -ENODEV;
- stat_dir = tracefs_create_dir("trace_stat", d_tracing);
+ stat_dir = tracefs_create_dir("trace_stat", NULL);
if (!stat_dir) {
pr_warn("Could not create tracefs 'trace_stat' entry\n");
return -ENOMEM;
@@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && (ret = tracing_stat_init()))
return ret;
- session->file = tracefs_create_file(session->ts->name, 0644,
- stat_dir,
- session, &tracing_stat_fops);
+ session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE,
+ stat_dir, session,
+ &tracing_stat_fops);
if (!session->file)
return -ENOMEM;
return 0;
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index ac35c45207c4..43f6fb6078db 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -5,17 +5,20 @@
#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
+#define SYNTH_FIELDS_MAX 64
-#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */
struct synth_field {
char *type;
char *name;
size_t size;
unsigned int offset;
+ unsigned int field_pos;
bool is_signed;
bool is_string;
+ bool is_dynamic;
+ bool is_stack;
};
struct synth_event {
@@ -24,6 +27,8 @@ struct synth_event {
char *name;
struct synth_field **fields;
unsigned int n_fields;
+ struct synth_field **dynamic_fields;
+ unsigned int n_dynamic_fields;
unsigned int n_u64;
struct trace_event_class class;
struct trace_event_call call;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d85a2f0f316b..9c581d6da843 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
/* parameter types */
- if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
@@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-extern char *__bad_type_size(void);
-
#define SYSCALL_FIELD(_type, _name) { \
.type = #_type, .name = #_name, \
.size = sizeof(_type), .align = __alignof__(_type), \
@@ -296,11 +294,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned long irq_flags;
+ struct trace_event_buffer fbuffer;
unsigned long args[6];
- int pc;
int syscall_nr;
int size;
@@ -322,22 +317,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- local_save_flags(irq_flags);
- pc = preempt_count();
-
- buffer = tr->array_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer,
- sys_data->enter_event->event.type, size, irq_flags, pc);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, args);
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_event_buffer_commit(&fbuffer);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -346,10 +335,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_file *trace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned long irq_flags;
- int pc;
+ struct trace_event_buffer fbuffer;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -368,22 +354,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- local_save_flags(irq_flags);
- pc = preempt_count();
-
- buffer = tr->array_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer,
- sys_data->exit_event->event.type, sizeof(*entry),
- irq_flags, pc);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_event_buffer_commit(&fbuffer);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
@@ -576,12 +555,15 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
struct syscall_trace_enter *rec)
{
struct syscall_tp_t {
- unsigned long long regs;
- unsigned long syscall_nr;
+ struct trace_entry ent;
+ int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
- } param;
+ } __aligned(8) param;
int i;
+ BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
+
+ /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -678,11 +660,12 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
struct syscall_trace_exit *rec)
{
struct syscall_tp_t {
- unsigned long long regs;
- unsigned long syscall_nr;
+ struct trace_entry ent;
+ int syscall_nr;
unsigned long ret;
- } param;
+ } __aligned(8) param;
+ /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fdd47f99b18f..a84b85d8aac1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_uprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/module.h>
@@ -15,6 +16,7 @@
#include <linux/namei.h>
#include <linux/string.h>
#include <linux/rculist.h>
+#include <linux/filter.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -34,7 +36,7 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_create(const char *raw_command);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
@@ -83,18 +85,9 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
-#define SIZEOF_TRACE_UPROBE(n) \
- (offsetof(struct trace_uprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-struct uprobe_dispatch_data {
- struct trace_uprobe *tu;
- unsigned long bp_addr;
-};
-
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
unsigned long func, struct pt_regs *regs);
@@ -158,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
return -ENOMEM;
if (addr == FETCH_TOKEN_COMM)
- ret = strlcpy(dst, current->comm, maxlen);
+ ret = strscpy(dst, current->comm, maxlen);
else
ret = strncpy_from_user(dst, src, maxlen);
if (ret >= 0) {
@@ -172,7 +165,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
*/
ret++;
*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
- }
+ } else
+ *(u32 *)dest = make_data_loc(0, (void *)dst - base);
return ret;
}
@@ -217,10 +211,12 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
/* 1st stage: get value from context */
switch (code->op) {
@@ -236,20 +232,16 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
case FETCH_OP_COMM:
val = FETCH_TOKEN_COMM;
break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -314,7 +306,8 @@ static bool trace_uprobe_match(const char *system, const char *event,
{
struct trace_uprobe *tu = to_trace_uprobe(ev);
- return strcmp(trace_probe_name(&tu->tp), event) == 0 &&
+ return (event[0] == '\0' ||
+ strcmp(trace_probe_name(&tu->tp), event) == 0) &&
(!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) &&
trace_uprobe_match_command_head(tu, argc, argv);
}
@@ -340,7 +333,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
struct trace_uprobe *tu;
int ret;
- tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL);
if (!tu)
return ERR_PTR(-ENOMEM);
@@ -393,6 +386,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
if (trace_probe_has_sibling(&tu->tp))
goto unreg;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp)))
+ return -EBUSY;
+
ret = unregister_uprobe_event(tu);
if (ret)
return ret;
@@ -408,12 +405,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig,
struct trace_uprobe *comp)
{
struct trace_probe_event *tpe = orig->tp.event;
- struct trace_probe *pos;
struct inode *comp_inode = d_real_inode(comp->path.dentry);
int i;
- list_for_each_entry(pos, &tpe->probes, list) {
- orig = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
if (comp_inode != d_real_inode(orig->path.dentry) ||
comp->offset != orig->offset)
continue;
@@ -455,7 +450,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
/* Append to existing event */
ret = trace_probe_append(&tu->tp, &to->tp);
if (!ret)
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
return ret;
}
@@ -514,11 +509,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
ret = register_uprobe_event(tu);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
end:
mutex_unlock(&event_mutex);
@@ -528,20 +527,21 @@ end:
/*
* Argument syntax:
- * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
+ * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
-static int trace_uprobe_create(int argc, const char **argv)
+static int __trace_uprobe_create(int argc, const char **argv)
{
struct trace_uprobe *tu;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
char buf[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
+ enum probe_print_type ptype;
struct path path;
unsigned long offset, ref_ctr_offset;
bool is_return = false;
int i, ret;
- ret = 0;
ref_ctr_offset = 0;
switch (argv[0][0]) {
@@ -617,6 +617,19 @@ static int trace_uprobe_create(int argc, const char **argv)
}
}
+ /* Check if there is %return suffix */
+ tmp = strchr(arg, '%');
+ if (tmp) {
+ if (!strcmp(tmp, "%return")) {
+ *tmp = '\0';
+ is_return = true;
+ } else {
+ trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX);
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+ }
+
/* Parse uprobe offset. */
ret = kstrtoul(arg, 0, &offset);
if (ret) {
@@ -627,11 +640,13 @@ static int trace_uprobe_create(int argc, const char **argv)
/* setup a probe */
trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf,
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
goto fail_address_parse;
- } else {
+ }
+
+ if (!event) {
char *tail;
char *ptr;
@@ -667,21 +682,19 @@ static int trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
+ struct traceprobe_parse_context ctx = {
+ .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
+ };
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
- is_return ? TPARG_FL_RETURN : 0);
- kfree(tmp);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
+ traceprobe_finish_parse(&ctx);
if (ret)
goto error;
}
- ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu));
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tu->tp, ptype);
if (ret < 0)
goto error;
@@ -703,14 +716,19 @@ fail_address_parse:
return ret;
}
-static int create_or_delete_trace_uprobe(int argc, char **argv)
+int trace_uprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_uprobe_create);
+}
+
+static int create_or_delete_trace_uprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_uprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(argc, (const char **)argv);
+ ret = trace_uprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -931,8 +949,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
- struct trace_buffer *buffer;
- struct ring_buffer_event *event;
+ struct trace_event_buffer fbuffer;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
@@ -947,12 +964,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type, size, 0, 0);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
if (is_ret_probe(tu)) {
entry->vaddr[0] = func;
entry->vaddr[1] = instruction_pointer(regs);
@@ -964,7 +979,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+ trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
@@ -1023,7 +1038,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
data = DATAOF_TRACE_ENTRY(entry, false);
}
- if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
+ if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
goto out;
trace_seq_putc(s, '\n');
@@ -1057,14 +1072,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
static void __probe_event_disable(struct trace_probe *tp)
{
- struct trace_probe *pos;
struct trace_uprobe *tu;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
if (!tu->inode)
continue;
@@ -1076,7 +1089,7 @@ static void __probe_event_disable(struct trace_probe *tp)
static int probe_event_enable(struct trace_event_call *call,
struct trace_event_file *file, filter_func_t filter)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
bool enabled;
int ret;
@@ -1111,8 +1124,7 @@ static int probe_event_enable(struct trace_event_call *call,
if (ret)
goto err_flags;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = trace_uprobe_enable(tu, filter);
if (ret) {
__probe_event_disable(tp);
@@ -1257,7 +1269,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
static int uprobe_perf_close(struct trace_event_call *call,
struct perf_event *event)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
int ret = 0;
@@ -1269,8 +1281,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
if (ret)
break;
@@ -1282,7 +1293,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
static int uprobe_perf_open(struct trace_event_call *call,
struct perf_event *event)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
int err = 0;
@@ -1294,7 +1305,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
if (trace_uprobe_filter_add(tu->tp.event->filter, event))
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
@@ -1333,15 +1344,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
int size, esize;
int rctx;
+#ifdef CONFIG_BPF_EVENTS
if (bpf_prog_array_valid(call)) {
u32 ret;
- preempt_disable();
- ret = trace_call_bpf(call, regs);
- preempt_enable();
+ ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run);
if (!ret)
return;
}
+#endif /* CONFIG_BPF_EVENTS */
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
@@ -1403,7 +1414,7 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
const char **filename, u64 *probe_offset,
- bool perf_type_tracepoint)
+ u64 *probe_addr, bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1420,6 +1431,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
+ *probe_addr = 0;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -1456,7 +1468,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
default:
return 0;
}
- return 0;
}
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
@@ -1568,6 +1579,7 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs,
unsigned long ref_ctr_offset, bool is_return)
{
+ enum probe_print_type ptype;
struct trace_uprobe *tu;
struct path path;
int ret;
@@ -1600,9 +1612,15 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->path = path;
tu->ref_ctr_offset = ref_ctr_offset;
tu->filename = kstrdup(name, GFP_KERNEL);
+ if (!tu->filename) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
init_trace_event_call(tu);
- if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1623,24 +1641,23 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
}
#endif /* CONFIG_PERF_EVENTS */
-/* Make a trace interface for controling probe points */
+/* Make a trace interface for controlling probe points */
static __init int init_uprobe_trace(void)
{
- struct dentry *d_tracer;
int ret;
ret = dyn_event_register(&trace_uprobe_ops);
if (ret)
return ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- trace_create_file("uprobe_events", 0644, d_tracer,
+ trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL,
NULL, &uprobe_events_ops);
/* Profile interface */
- trace_create_file("uprobe_profile", 0444, d_tracer,
+ trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL,
NULL, &uprobe_profile_ops);
return 0;
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 74738c9856f1..a4dcf0f24352 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -15,6 +15,7 @@
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <linux/kmemleak.h>
#include "tracing_map.h"
#include "trace.h"
@@ -260,7 +261,7 @@ int tracing_map_add_var(struct tracing_map *map)
* to use cmp_fn.
*
* A key can be a subset of a compound key; for that purpose, the
- * offset param is used to describe where within the the compound key
+ * offset param is used to describe where within the compound key
* the key referenced by this key field resides.
*
* Return: The index identifying the field in the map and associated
@@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a)
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
+ kmemleak_free(a->pages[i]);
free_page((unsigned long)a->pages[i]);
}
@@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
if (!a->pages[i])
goto free;
+ kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL);
}
out:
return a;
@@ -571,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
}
memcpy(elt->key, key, map->key_size);
- entry->val = elt;
+ /*
+ * Ensure the initialization is visible and
+ * publish the elt.
+ */
+ smp_wmb();
+ WRITE_ONCE(entry->val, elt);
atomic64_inc(&map->hits);
return entry->val;
@@ -609,7 +617,7 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
* signal that state. There are two user-visible tracing_map
* variables, 'hits' and 'drops', which are updated by this function.
* Every time an element is either successfully inserted or retrieved,
- * the 'hits' value is incrememented. Every time an element insertion
+ * the 'hits' value is incremented. Every time an element insertion
* fails, the 'drops' value is incremented.
*
* This is a lock-free tracing map insertion function implementing a
@@ -642,9 +650,9 @@ struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
* tracing_map_elt. This is a lock-free lookup; see
* tracing_map_insert() for details on tracing_map and how it works.
* Every time an element is retrieved, the 'hits' value is
- * incrememented. There is one user-visible tracing_map variable,
+ * incremented. There is one user-visible tracing_map variable,
* 'hits', which is updated by this function. Every time an element
- * is successfully retrieved, the 'hits' value is incrememented. The
+ * is successfully retrieved, the 'hits' value is incremented. The
* 'drops' value is never updated by this function.
*
* Return: the tracing_map_elt pointer val associated with the key.
@@ -834,29 +842,35 @@ int tracing_map_init(struct tracing_map *map)
return err;
}
-static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_dup(const void *A, const void *B)
{
+ const struct tracing_map_sort_entry *a, *b;
int ret = 0;
- if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ if (memcmp(a->key, b->key, a->elt->map->key_size))
ret = 1;
return ret;
}
-static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_sum(const void *A, const void *B)
{
const struct tracing_map_elt *elt_a, *elt_b;
+ const struct tracing_map_sort_entry *a, *b;
struct tracing_map_sort_key *sort_key;
struct tracing_map_field *field;
tracing_map_cmp_fn_t cmp_fn;
void *val_a, *val_b;
int ret = 0;
- elt_a = (*a)->elt;
- elt_b = (*b)->elt;
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ elt_a = a->elt;
+ elt_b = b->elt;
sort_key = &elt_a->map->sort_key;
@@ -873,18 +887,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
return ret;
}
-static int cmp_entries_key(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_key(const void *A, const void *B)
{
const struct tracing_map_elt *elt_a, *elt_b;
+ const struct tracing_map_sort_entry *a, *b;
struct tracing_map_sort_key *sort_key;
struct tracing_map_field *field;
tracing_map_cmp_fn_t cmp_fn;
void *val_a, *val_b;
int ret = 0;
- elt_a = (*a)->elt;
- elt_b = (*b)->elt;
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ elt_a = a->elt;
+ elt_b = b->elt;
sort_key = &elt_a->map->sort_key;
@@ -949,7 +966,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
- unsigned int dups = 0, total_dups = 0;
+ unsigned int total_dups = 0;
int i;
void *key;
@@ -962,11 +979,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
key = sort_entries[0]->key;
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
- dups++; total_dups++;
+ total_dups++;
continue;
}
key = sort_entries[i]->key;
- dups = 0;
}
WARN_ONCE(total_dups > 0,
@@ -989,10 +1005,8 @@ static void sort_secondary(struct tracing_map *map,
struct tracing_map_sort_key *primary_key,
struct tracing_map_sort_key *secondary_key)
{
- int (*primary_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
- int (*secondary_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
+ int (*primary_fn)(const void *, const void *);
+ int (*secondary_fn)(const void *, const void *);
unsigned i, start = 0, n_sub = 1;
if (is_key(map, primary_key->field_idx))
@@ -1035,7 +1049,8 @@ static void sort_secondary(struct tracing_map *map,
/**
* tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
* @map: The tracing_map
- * @sort_key: The sort key to use for sorting
+ * @sort_keys: The sort key to use for sorting
+ * @n_sort_keys: hitcount, always have at least one
* @sort_entries: outval: pointer to allocated and sorted array of entries
*
* tracing_map_sort_entries() sorts the current set of entries in the
@@ -1061,8 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
unsigned int n_sort_keys,
struct tracing_map_sort_entry ***sort_entries)
{
- int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
+ int (*cmp_entries_fn)(const void *, const void *);
struct tracing_map_sort_entry *sort_entry, **entries;
int i, n_entries, ret;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index a6de61fc22de..99c37eeebc16 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -50,7 +50,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
* an instance of tracing_map_elt, where 'elt' in the latter part of
* that variable name is short for 'element'. The purpose of a
* tracing_map_elt is to hold values specific to the particular
- * 32-bit hashed key it's assocated with. Things such as the unique
+ * 32-bit hashed key it's associated with. Things such as the unique
* set of aggregated sums associated with the 32-bit hashed key, along
* with a copy of the full key associated with the entry, and which
* was used to produce the 32-bit hashed key.
@@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
-extern void tracing_map_set_field_descr(struct tracing_map *map,
- unsigned int i,
- unsigned int key_offset,
- tracing_map_cmp_fn_t cmp_fn);
extern int
tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_key *sort_keys,
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 73956eaff8a9..8d1507dd0724 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -15,12 +15,57 @@
#include <linux/sched/task.h>
#include <linux/static_key.h>
+enum tp_func_state {
+ TP_FUNC_0,
+ TP_FUNC_1,
+ TP_FUNC_2,
+ TP_FUNC_N,
+};
+
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
+enum tp_transition_sync {
+ TP_TRANSITION_SYNC_1_0_1,
+ TP_TRANSITION_SYNC_N_2_1,
+
+ _NR_TP_TRANSITION_SYNC,
+};
+
+struct tp_transition_snapshot {
+ unsigned long rcu;
+ unsigned long srcu;
+ bool ongoing;
+};
+
+/* Protected by tracepoints_mutex */
+static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
+
+static void tp_rcu_get_state(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ /* Keep the latest get_state snapshot. */
+ snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = true;
+}
+
+static void tp_rcu_cond_sync(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ if (!snapshot->ongoing)
+ return;
+ cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
+ synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = false;
+}
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -50,9 +95,15 @@ static bool ok_to_free_tracepoints;
*/
struct tp_probes {
struct rcu_head rcu;
- struct tracepoint_func probes[0];
+ struct tracepoint_func probes[];
};
+/* Called in removal of a func but failed to allocate a new tp_funcs */
+static void tp_stub_func(void)
+{
+ return;
+}
+
static inline void *allocate_probes(int count)
{
struct tp_probes *p = kmalloc(struct_size(p, probes, count),
@@ -130,8 +181,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
int prio)
{
struct tracepoint_func *old, *new;
- int nr_probes = 0;
- int pos = -1;
+ int iter_probes; /* Iterate over old probe array. */
+ int nr_probes = 0; /* Counter for probes */
+ int pos = -1; /* Insertion position into new array */
if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
@@ -140,13 +192,13 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- /* Insert before probes of lower priority */
- if (pos < 0 && old[nr_probes].prio < prio)
- pos = nr_probes;
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue; /* Skip stub functions. */
+ if (old[iter_probes].func == tp_func->func &&
+ old[iter_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ nr_probes++;
}
}
/* + 2 : one for new probe, one for NULL func */
@@ -154,20 +206,24 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old) {
- if (pos < 0) {
- pos = nr_probes;
- memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- } else {
- /* Copy higher priority probes ahead of the new probe */
- memcpy(new, old, pos * sizeof(struct tracepoint_func));
- /* Copy the rest after it. */
- memcpy(new + pos + 1, old + pos,
- (nr_probes - pos) * sizeof(struct tracepoint_func));
+ nr_probes = 0;
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue;
+ /* Insert before probes of lower priority */
+ if (pos < 0 && old[iter_probes].prio < prio)
+ pos = nr_probes++;
+ new[nr_probes++] = old[iter_probes];
}
- } else
+ if (pos < 0)
+ pos = nr_probes++;
+ /* nr_probes now points to the end of the new array */
+ } else {
pos = 0;
+ nr_probes = 1; /* must point at end of array */
+ }
new[pos] = *tp_func;
- new[nr_probes + 1].func = NULL;
+ new[nr_probes].func = NULL;
*funcs = new;
debug_print_probes(*funcs);
return old;
@@ -188,8 +244,9 @@ static void *func_remove(struct tracepoint_func **funcs,
/* (N -> M), (N > 1, M >= 0) probes */
if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ if ((old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data) ||
+ old[nr_probes].func == tp_stub_func)
nr_del++;
}
}
@@ -208,24 +265,64 @@ static void *func_remove(struct tracepoint_func **funcs,
/* N -> M, (N > 1, M > 0) */
/* + 1 for NULL */
new = allocate_probes(nr_probes - nr_del + 1);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- for (i = 0; old[i].func; i++)
- if (old[i].func != tp_func->func
- || old[i].data != tp_func->data)
- new[j++] = old[i];
- new[nr_probes - nr_del].func = NULL;
- *funcs = new;
+ if (new) {
+ for (i = 0; old[i].func; i++) {
+ if ((old[i].func != tp_func->func ||
+ old[i].data != tp_func->data) &&
+ old[i].func != tp_stub_func)
+ new[j++] = old[i];
+ }
+ new[nr_probes - nr_del].func = NULL;
+ *funcs = new;
+ } else {
+ /*
+ * Failed to allocate, replace the old function
+ * with calls to tp_stub_func.
+ */
+ for (i = 0; old[i].func; i++) {
+ if (old[i].func == tp_func->func &&
+ old[i].data == tp_func->data)
+ WRITE_ONCE(old[i].func, tp_stub_func);
+ }
+ *funcs = old;
+ }
}
debug_print_probes(*funcs);
return old;
}
/*
+ * Count the number of functions (enum tp_func_state) in a tp_funcs array.
+ */
+static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs)
+{
+ if (!tp_funcs)
+ return TP_FUNC_0;
+ if (!tp_funcs[1].func)
+ return TP_FUNC_1;
+ if (!tp_funcs[2].func)
+ return TP_FUNC_2;
+ return TP_FUNC_N; /* 3 or more */
+}
+
+static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs)
+{
+ void *func = tp->iterator;
+
+ /* Synthetic events do not have static call sites */
+ if (!tp->static_call_key)
+ return;
+ if (nr_func_state(tp_funcs) == TP_FUNC_1)
+ func = tp_funcs[0].func;
+ __static_call_update(tp->static_call_key, tp->static_call_tramp, func);
+}
+
+/*
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func, int prio)
+ struct tracepoint_func *func, int prio,
+ bool warn)
{
struct tracepoint_func *old, *tp_funcs;
int ret;
@@ -240,7 +337,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -250,9 +347,42 @@ static int tracepoint_add_func(struct tracepoint *tp,
* a pointer to it. This array is referenced by __DO_TRACE from
* include/linux/tracepoint.h using rcu_dereference_sched().
*/
- rcu_assign_pointer(tp->funcs, tp_funcs);
- if (!static_key_enabled(&tp->key))
- static_key_slow_inc(&tp->key);
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_1: /* 0->1 */
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ static_key_enable(&tp->key);
+ break;
+ case TP_FUNC_2: /* 1->2 */
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /*
+ * Iterator callback installed before updating tp->funcs.
+ * Requires ordering between RCU assign/dereference and
+ * static call update/call.
+ */
+ fallthrough;
+ case TP_FUNC_N: /* N->N+1 (N>1) */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>1) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
release_probes(old);
return 0;
}
@@ -271,25 +401,91 @@ static int tracepoint_remove_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
- if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ if (WARN_ON_ONCE(IS_ERR(old)))
return PTR_ERR(old);
- }
- if (!tp_funcs) {
+ if (tp_funcs == old)
+ /* Failed allocating new tp_funcs, replaced func with stub */
+ return 0;
+
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_0: /* 1->0 */
/* Removed last function */
if (tp->unregfunc && static_key_enabled(&tp->key))
tp->unregfunc();
- if (static_key_enabled(&tp->key))
- static_key_slow_dec(&tp->key);
+ static_key_disable(&tp->key);
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, NULL);
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1);
+ break;
+ case TP_FUNC_1: /* 2->1 */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence. If the first
+ * element's data has changed, then force the synchronization
+ * to prevent current readers that have loaded the old data
+ * from calling the new function.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ break;
+ case TP_FUNC_2: /* N->N-1 (N>2) */
+ fallthrough;
+ case TP_FUNC_N:
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- rcu_assign_pointer(tp->funcs, tp_funcs);
release_probes(old);
return 0;
}
/**
+ * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ * @prio: priority of this function over other registered functions
+ *
+ * Same as tracepoint_probe_register_prio() except that it will not warn
+ * if the tracepoint is already registered.
+ */
+int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe,
+ void *data, int prio)
+{
+ struct tracepoint_func tp_func;
+ int ret;
+
+ mutex_lock(&tracepoints_mutex);
+ tp_func.func = probe;
+ tp_func.data = data;
+ tp_func.prio = prio;
+ ret = tracepoint_add_func(tp, &tp_func, prio, false);
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
+
+/**
* tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
* @tp: tracepoint
* @probe: probe handler
@@ -312,7 +508,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio);
+ ret = tracepoint_add_func(tp, &tp_func, prio, true);
mutex_unlock(&tracepoints_mutex);
return ret;
}
@@ -375,13 +571,14 @@ static void for_each_tracepoint_range(
bool trace_module_has_bad_taint(struct module *mod)
{
return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
- (1 << TAINT_UNSIGNED_MODULE));
+ (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) |
+ (1 << TAINT_LIVEPATCH));
}
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
/**
- * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * register_tracepoint_module_notifier - register tracepoint coming/going notifier
* @nb: notifier block
*
* Notifiers registered with this function are called on module
@@ -407,7 +604,7 @@ end:
EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
/**
- * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier
* @nb: notifier block
*
* The notifier block callback should expect a "struct tp_module" data
@@ -443,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
static int tracepoint_module_coming(struct module *mod)
{
struct tp_module *tp_mod;
- int ret = 0;
if (!mod->num_tracepoints)
return 0;
@@ -451,23 +647,22 @@ static int tracepoint_module_coming(struct module *mod)
/*
* We skip modules that taint the kernel, especially those with different
* module headers (for forced load), to make sure we don't cause a crash.
- * Staging, out-of-tree, and unsigned GPL modules are fine.
+ * Staging, out-of-tree, unsigned GPL, and test modules are fine.
*/
if (trace_module_has_bad_taint(mod))
return 0;
- mutex_lock(&tracepoint_module_list_mutex);
+
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
- if (!tp_mod) {
- ret = -ENOMEM;
- goto end;
- }
+ if (!tp_mod)
+ return -ENOMEM;
tp_mod->mod = mod;
+
+ mutex_lock(&tracepoint_module_list_mutex);
list_add_tail(&tp_mod->list, &tracepoint_module_list);
blocking_notifier_call_chain(&tracepoint_notify_list,
MODULE_STATE_COMING, tp_mod);
-end:
mutex_unlock(&tracepoint_module_list_mutex);
- return ret;
+ return 0;
}
static void tracepoint_module_going(struct module *mod)
@@ -521,7 +716,7 @@ static int tracepoint_module_notify(struct notifier_block *self,
case MODULE_STATE_UNFORMED:
break;
}
- return ret;
+ return notifier_from_errno(ret);
}
static struct notifier_block tracepoint_module_nb = {
@@ -567,7 +762,7 @@ int syscall_regfunc(void)
if (!sys_tracepoint_refcount) {
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ set_task_syscall_work(t, SYSCALL_TRACEPOINT);
}
read_unlock(&tasklist_lock);
}
@@ -584,7 +779,7 @@ void syscall_unregfunc(void)
if (!sys_tracepoint_refcount) {
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ clear_task_syscall_work(t, SYSCALL_TRACEPOINT);
}
read_unlock(&tasklist_lock);
}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 257ffb993ea2..4252f0645b9e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -23,26 +23,30 @@ void bacct_add_tsk(struct user_namespace *user_ns,
{
const struct cred *tcred;
u64 utime, stime, utimescaled, stimescaled;
- u64 delta;
+ u64 now_ns, delta;
time64_t btime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
/* calculate task elapsed time in nsec */
- delta = ktime_get_ns() - tsk->start_time;
+ now_ns = ktime_get_ns();
+ /* store whole group time first */
+ delta = now_ns - tsk->group_leader->start_time;
/* Convert to micro seconds */
do_div(delta, NSEC_PER_USEC);
+ stats->ac_tgetime = delta;
+ delta = now_ns - tsk->start_time;
+ do_div(delta, NSEC_PER_USEC);
stats->ac_etime = delta;
/* Convert to seconds for btime (note y2106 limit) */
btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
@@ -52,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_nice = task_nice(tsk);
stats->ac_sched = tsk->policy;
stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
+ stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
@@ -137,7 +142,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+ tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
}
/**
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..4aa6166cb856 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,12 @@
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
+struct ucounts init_ucounts = {
+ .ns = &init_user_ns,
+ .uid = GLOBAL_ROOT_UID,
+ .count = ATOMIC_INIT(1),
+};
+
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);
@@ -52,14 +58,17 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-#define UCOUNT_ENTRY(name) \
- { \
- .procname = name, \
- .maxlen = sizeof(int), \
- .mode = 0644, \
- .proc_handler = proc_dointvec_minmax, \
- .extra1 = SYSCTL_ZERO, \
- .extra2 = SYSCTL_INT_MAX, \
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(long), \
+ .mode = 0644, \
+ .proc_handler = proc_doulongvec_minmax, \
+ .extra1 = &ue_zero, \
+ .extra2 = &ue_int_max, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
@@ -74,6 +83,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+#ifdef CONFIG_FANOTIFY
+ UCOUNT_ENTRY("max_fanotify_groups"),
+ UCOUNT_ENTRY("max_fanotify_marks"),
+#endif
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -91,7 +104,8 @@ bool setup_userns_sysctls(struct user_namespace *ns)
for (i = 0; i < UCOUNT_COUNTS; i++) {
tbl[i].data = &ns->ucount_max[i];
}
- ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+ ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl,
+ ARRAY_SIZE(user_table));
}
if (!ns->sysctls) {
kfree(tbl);
@@ -125,10 +139,34 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
return NULL;
}
-static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+static void hlist_add_ucounts(struct ucounts *ucounts)
+{
+ struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+ spin_lock_irq(&ucounts_lock);
+ hlist_add_head(&ucounts->node, hashent);
+ spin_unlock_irq(&ucounts_lock);
+}
+
+static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
+{
+ /* Returns true on a successful get, false if the count wraps. */
+ return !atomic_add_negative(1, &ucounts->count);
+}
+
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+ if (!get_ucounts_or_wrap(ucounts)) {
+ put_ucounts(ucounts);
+ ucounts = NULL;
+ }
+ return ucounts;
+}
+
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
+ bool wrapped;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -141,7 +179,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns;
new->uid = uid;
- new->count = 0;
+ atomic_set(&new->count, 1);
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -149,40 +187,40 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
kfree(new);
} else {
hlist_add_head(&new->node, hashent);
- ucounts = new;
+ get_user_ns(new->ns);
+ spin_unlock_irq(&ucounts_lock);
+ return new;
}
}
- if (ucounts->count == INT_MAX)
- ucounts = NULL;
- else
- ucounts->count += 1;
+ wrapped = !get_ucounts_or_wrap(ucounts);
spin_unlock_irq(&ucounts_lock);
+ if (wrapped) {
+ put_ucounts(ucounts);
+ return NULL;
+ }
return ucounts;
}
-static void put_ucounts(struct ucounts *ucounts)
+void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- spin_lock_irqsave(&ucounts_lock, flags);
- ucounts->count -= 1;
- if (!ucounts->count)
+ if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
- else
- ucounts = NULL;
- spin_unlock_irqrestore(&ucounts_lock, flags);
-
- kfree(ucounts);
+ spin_unlock_irqrestore(&ucounts_lock, flags);
+ put_user_ns(ucounts->ns);
+ kfree(ucounts);
+ }
}
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{
- int c, old;
- c = atomic_read(v);
+ long c, old;
+ c = atomic_long_read(v);
for (;;) {
if (unlikely(c >= u))
return false;
- old = atomic_cmpxchg(v, c, c+1);
+ old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c))
return true;
c = old;
@@ -194,19 +232,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
{
struct ucounts *ucounts, *iter, *bad;
struct user_namespace *tns;
- ucounts = get_ucounts(ns, uid);
+ ucounts = alloc_ucounts(ns, uid);
for (iter = ucounts; iter; iter = tns->ucounts) {
- int max;
+ long max;
tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]);
- if (!atomic_inc_below(&iter->ucount[type], max))
+ if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail;
}
return ucounts;
fail:
bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
- atomic_dec(&iter->ucount[type]);
+ atomic_long_dec(&iter->ucount[type]);
put_ucounts(ucounts);
return NULL;
@@ -216,12 +254,107 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
}
put_ucounts(ucounts);
}
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+ struct ucounts *iter;
+ long max = LONG_MAX;
+ long ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long new = atomic_long_add_return(v, &iter->rlimit[type]);
+ if (new < 0 || new > max)
+ ret = LONG_MAX;
+ else if (iter == ucounts)
+ ret = new;
+ max = get_userns_rlimit_max(iter->ns, type);
+ }
+ return ret;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+ struct ucounts *iter;
+ long new = -1; /* Silence compiler warning */
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long dec = atomic_long_sub_return(v, &iter->rlimit[type]);
+ WARN_ON_ONCE(dec < 0);
+ if (iter == ucounts)
+ new = dec;
+ }
+ return (new == 0);
+}
+
+static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
+ struct ucounts *last, enum rlimit_type type)
+{
+ struct ucounts *iter, *next;
+ for (iter = ucounts; iter != last; iter = next) {
+ long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
+ WARN_ON_ONCE(dec < 0);
+ next = iter->ns->ucounts;
+ if (dec == 0)
+ put_ucounts(iter);
+ }
+}
+
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+{
+ do_dec_rlimit_put_ucounts(ucounts, NULL, type);
+}
+
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+{
+ /* Caller must hold a reference to ucounts */
+ struct ucounts *iter;
+ long max = LONG_MAX;
+ long dec, ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long new = atomic_long_add_return(1, &iter->rlimit[type]);
+ if (new < 0 || new > max)
+ goto unwind;
+ if (iter == ucounts)
+ ret = new;
+ max = get_userns_rlimit_max(iter->ns, type);
+ /*
+ * Grab an extra ucount reference for the caller when
+ * the rlimit count was previously 0.
+ */
+ if (new != 1)
+ continue;
+ if (!get_ucounts(iter))
+ goto dec_unwind;
+ }
+ return ret;
+dec_unwind:
+ dec = atomic_long_sub_return(1, &iter->rlimit[type]);
+ WARN_ON_ONCE(dec < 0);
+unwind:
+ do_dec_rlimit_put_ucounts(ucounts, iter, type);
+ return 0;
+}
+
+bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
+{
+ struct ucounts *iter;
+ long max = rlimit;
+ if (rlimit > LONG_MAX)
+ max = LONG_MAX;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long val = get_rlimit_value(iter, type);
+ if (val < 0 || val > max)
+ return true;
+ max = get_userns_rlimit_max(iter->ns, type);
+ }
+ return false;
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
@@ -232,11 +365,13 @@ static __init int user_namespace_sysctl_init(void)
* default set so that registrations in the child sets work
* properly.
*/
- user_header = register_sysctl("user", empty);
+ user_header = register_sysctl_sz("user", empty, 0);
kmemleak_ignore(user_header);
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
+ hlist_add_ucounts(&init_ucounts);
+ inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..1b13c5d34624 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -14,6 +14,7 @@
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
@@ -26,20 +27,15 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
+#include <linux/initrd.h>
+#include <linux/freezer.h>
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
@@ -76,6 +72,14 @@ static int call_usermodehelper_exec_async(void *data)
spin_unlock_irq(&current->sighand->siglock);
/*
+ * Initial kernel threads share ther FS with init, in order to
+ * get the init root directory. But we've now created a new
+ * thread that is going to execve a user process and has its own
+ * 'struct fs_struct'. Reset umask to the default.
+ */
+ current->fs->umask = 0022;
+
+ /*
* Our parent (unbound workqueue) runs with elevated scheduling
* priority. Avoid propagating that into the userspace child.
*/
@@ -102,16 +106,10 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- sub_info->pid = task_pid_nr(current);
- if (sub_info->file) {
- retval = do_execve_file(sub_info->file,
- sub_info->argv, sub_info->envp);
- if (!retval)
- current->flags |= PF_UMH;
- } else
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ wait_for_initramfs();
+ retval = kernel_execve(sub_info->path,
+ (const char *const *)sub_info->argv,
+ (const char *const *)sub_info->envp);
out:
sub_info->retval = retval;
/*
@@ -130,37 +128,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}
@@ -192,8 +169,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
* want to pollute current->children, and we need a parent
* that always ignores SIGCHLD to ensure auto-reaping.
*/
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
@@ -359,8 +336,8 @@ static void helper_unlock(void)
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
* @init: an init function
+ * @cleanup: a cleanup function
* @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
@@ -371,7 +348,7 @@ static void helper_unlock(void)
* exec. A non-zero return code causes the process to error out, exit,
* and return the failure to the calling process
*
- * The cleanup function is just before ethe subprocess_info is about to
+ * The cleanup function is just before the subprocess_info is about to
* be freed. This can be used for freeing the argv and envp. The
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
@@ -405,143 +382,9 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info), void *data)
-{
- struct subprocess_info *sub_info;
- struct umh_info *info = data;
- const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
- sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
- if (!sub_info)
- return NULL;
-
- sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
- if (!sub_info->argv) {
- kfree(sub_info);
- return NULL;
- }
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
- sub_info->path = "none";
- sub_info->file = file;
- sub_info->init = init;
- sub_info->cleanup = cleanup;
- sub_info->data = data;
- return sub_info;
-}
-
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
- struct umh_info *umh_info = info->data;
- struct file *from_umh[2];
- struct file *to_umh[2];
- int err;
-
- /* create pipe to send data to umh */
- err = create_pipe_files(to_umh, 0);
- if (err)
- return err;
- err = replace_fd(0, to_umh[0], 0);
- fput(to_umh[0]);
- if (err < 0) {
- fput(to_umh[1]);
- return err;
- }
-
- /* create pipe to receive data from umh */
- err = create_pipe_files(from_umh, 0);
- if (err) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- return err;
- }
- err = replace_fd(1, from_umh[1], 0);
- fput(from_umh[1]);
- if (err < 0) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- fput(from_umh[0]);
- return err;
- }
-
- umh_info->pipe_to_umh = to_umh[1];
- umh_info->pipe_from_umh = from_umh[0];
- return 0;
-}
-
-static void umh_clean_and_save_pid(struct subprocess_info *info)
-{
- struct umh_info *umh_info = info->data;
-
- /* cleanup if umh_pipe_setup() was successful but exec failed */
- if (info->pid && info->retval) {
- fput(umh_info->pipe_to_umh);
- fput(umh_info->pipe_from_umh);
- }
-
- argv_free(info->argv);
- umh_info->pid = info->pid;
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
- struct subprocess_info *sub_info;
- struct file *file;
- ssize_t written;
- loff_t pos = 0;
- int err;
-
- file = shmem_kernel_file_setup("", len, 0);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- written = kernel_write(file, data, len, &pos);
- if (written != len) {
- err = written;
- if (err >= 0)
- err = -ENOMEM;
- goto out;
- }
-
- err = -ENOMEM;
- sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
- umh_clean_and_save_pid, info);
- if (!sub_info)
- goto out;
-
- err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
- if (!err) {
- mutex_lock(&umh_list_lock);
- list_add(&info->list, &umh_list);
- mutex_unlock(&umh_list_lock);
- }
-out:
- fput(file);
- return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
/**
* call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
+ * @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
@@ -558,6 +401,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE;
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
@@ -591,18 +435,28 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
+ if (wait & UMH_FREEZABLE)
+ state |= TASK_FREEZABLE;
+
if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
if (!retval)
goto wait_done;
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
- /* fallthrough, umh_complete() was already called */
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
- wait_for_completion(&done);
wait_done:
retval = sub_info->retval;
out:
@@ -640,13 +494,14 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
}
EXPORT_SYMBOL(call_usermodehelper);
+#if defined(CONFIG_SYSCTL)
static int proc_cap_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
+ unsigned long cap_array[2];
+ kernel_cap_t new_cap, *cap;
+ int err;
if (write && (!capable(CAP_SETPCAP) ||
!capable(CAP_SYS_MODULE)))
@@ -655,16 +510,13 @@ static int proc_cap_handler(struct ctl_table *table, int write,
/*
* convert from the global kernel_cap_t to the ulong array to print to
* userspace if this is a read.
+ *
+ * Legacy format: capabilities are exposed as two 32-bit values
*/
+ cap = table->data;
spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
+ cap_array[0] = (u32) cap->val;
+ cap_array[1] = cap->val >> 32;
spin_unlock(&umh_sysctl_lock);
t = *table;
@@ -678,62 +530,43 @@ static int proc_cap_handler(struct ctl_table *table, int write,
if (err < 0)
return err;
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
+ new_cap.val = (u32)cap_array[0];
+ new_cap.val += (u64)cap_array[1] << 32;
/*
* Drop everything not in the new_cap (but don't add things)
*/
if (write) {
spin_lock(&umh_sysctl_lock);
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ *cap = cap_intersect(*cap, new_cap);
spin_unlock(&umh_sysctl_lock);
}
return 0;
}
-void __exit_umh(struct task_struct *tsk)
-{
- struct umh_info *info;
- pid_t pid = tsk->pid;
-
- mutex_lock(&umh_list_lock);
- list_for_each_entry(info, &umh_list, list) {
- if (info->pid == pid) {
- list_del(&info->list);
- mutex_unlock(&umh_list_lock);
- goto out;
- }
- }
- mutex_unlock(&umh_list_lock);
- return;
-out:
- if (info->cleanup)
- info->cleanup(info);
-}
-
-struct ctl_table usermodehelper_table[] = {
+static struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .data = &usermodehelper_bset,
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{
.procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .data = &usermodehelper_inheritable,
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{ }
};
+
+static int __init init_umh_sysctls(void)
+{
+ register_sysctl_init("kernel/usermodehelper", usermodehelper_table);
+ return 0;
+}
+early_initcall(init_umh_sysctls);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/up.c b/kernel/up.c
index c6f323dcd45b..df50828cc2f0 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -36,38 +36,9 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
}
EXPORT_SYMBOL(smp_call_function_single_async);
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/*
- * Note we still need to test the mask even for UP
- * because we actually can get an empty mask from
- * code that on SMP might call us without the local
- * CPU in the mask.
- */
-void on_each_cpu_mask(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
-{
- unsigned long flags;
-
- if (cpumask_test_cpu(0, mask)) {
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
/*
* Preemption is disabled here to make sure the cond_func is called under the
- * same condtions in UP and SMP.
+ * same conditions in UP and SMP.
*/
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
@@ -75,7 +46,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
unsigned long flags;
preempt_disable();
- if (cond_func(0, info)) {
+ if ((!cond_func || cond_func(0, info)) && cpumask_test_cpu(0, mask)) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
@@ -84,13 +55,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait)
-{
- on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
int ret;
diff --git a/kernel/user.c b/kernel/user.c
index b1635d94a1f2..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
#include <linux/proc_ns.h>
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+ .entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+ .enabled = true,
+ .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
/*
* userns count is 1 for root user, 1 for init_uts_ns,
* and 1 for... ?
@@ -55,7 +65,7 @@ struct user_namespace init_user_ns = {
},
},
},
- .count = ATOMIC_INIT(3),
+ .ns.count = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.ns.inum = PROC_USER_INIT_INO,
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ .binfmt_misc = &init_binfmt_misc,
+#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -98,9 +111,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
- .sigpending = ATOMIC_INIT(0),
- .locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
};
@@ -132,6 +142,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
return NULL;
}
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+ return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
@@ -141,6 +167,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
+ user_epoll_free(up);
kmem_cache_free(uid_cachep, up);
}
@@ -171,6 +198,7 @@ void free_uid(struct user_struct *up)
if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
}
+EXPORT_SYMBOL_GPL(free_uid);
struct user_struct *alloc_uid(kuid_t uid)
{
@@ -188,6 +216,10 @@ struct user_struct *alloc_uid(kuid_t uid)
new->uid = uid;
refcount_set(&new->__count, 1);
+ if (user_epoll_alloc(new)) {
+ kmem_cache_free(uid_cachep, new);
+ return NULL;
+ }
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
@@ -198,6 +230,7 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ user_epoll_free(new);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
@@ -219,6 +252,9 @@ static int __init uid_cache_init(void)
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(uidhash_table + n);
+ if (user_epoll_alloc(&root_user))
+ panic("root_user epoll percpu counter alloc failed");
+
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 87804e0371fe..ce4d99df5f0e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
+#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
@@ -21,7 +22,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
-static struct kmem_cache *user_ns_cachep __read_mostly;
+static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
@@ -58,6 +59,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->user_ns = user_ns;
}
+static unsigned long enforced_nproc_rlimit(void)
+{
+ unsigned long limit = RLIM_INFINITY;
+
+ /* Is RLIMIT_NPROC currently enforced? */
+ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
+ (current_user_ns() != &init_user_ns))
+ limit = rlimit(RLIMIT_NPROC);
+
+ return limit;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -85,7 +98,7 @@ int create_user_ns(struct cred *new)
/*
* Verify that we can not violate the policy of which files
* may be accessed that is specified by the root directory,
- * by verifing that the root directory is at the root of the
+ * by verifying that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
ret = -EPERM;
@@ -101,17 +114,22 @@ int create_user_ns(struct cred *new)
!kgid_has_mapping(parent_ns, group))
goto fail_dec;
+ ret = security_create_user_ns(new);
+ if (ret < 0)
+ goto fail_dec;
+
ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
goto fail_dec;
+ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
ns->ns.ops = &userns_operations;
- atomic_set(&ns->count, 1);
+ refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -121,6 +139,10 @@ int create_user_ns(struct cred *new)
for (i = 0; i < UCOUNT_COUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+ set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
@@ -191,13 +213,16 @@ static void free_user_ns(struct work_struct *work)
kfree(ns->projid_map.forward);
kfree(ns->projid_map.reverse);
}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ kfree(ns->binfmt_misc);
+#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
dec_user_namespaces(ucounts);
ns = parent;
- } while (atomic_dec_and_test(&parent->count));
+ } while (refcount_dec_and_test(&parent->ns.count));
}
void __put_user_ns(struct user_namespace *ns)
@@ -206,8 +231,8 @@ void __put_user_ns(struct user_namespace *ns)
}
EXPORT_SYMBOL(__put_user_ns);
-/**
- * idmap_key struct holds the information necessary to find an idmapping in a
+/*
+ * struct idmap_key - holds the information necessary to find an idmapping in a
* sorted idmap array. It is passed to cmp_map_id() as first argument.
*/
struct idmap_key {
@@ -216,7 +241,7 @@ struct idmap_key {
u32 count; /* == 0 unless used with map_id_range_down() */
};
-/**
+/*
* cmp_map_id - Function to be passed to bsearch() to find the requested
* idmapping. Expects struct idmap_key to be passed via @k.
*/
@@ -246,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e)
return 1;
}
-/**
+/*
* map_id_range_down_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -263,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou
sizeof(struct uid_gid_extent), cmp_map_id);
}
-/**
+/*
* map_id_range_down_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -307,12 +332,12 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
return id;
}
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
{
return map_id_range_down(map, id, 1);
}
-/**
+/*
* map_id_up_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -333,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
return NULL;
}
-/**
+/*
* map_id_up_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -350,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
sizeof(struct uid_gid_extent), cmp_map_id);
}
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
{
struct uid_gid_extent *extent;
unsigned extents = map->nr_extents;
@@ -515,7 +540,7 @@ EXPORT_SYMBOL(from_kgid_munged);
*
* When there is no mapping defined for the user-namespace projid
* pair INVALID_PROJID is returned. Callers are expected to test
- * for and handle handle INVALID_PROJID being returned. INVALID_PROJID
+ * for and handle INVALID_PROJID being returned. INVALID_PROJID
* may be tested for using projid_valid().
*/
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
@@ -745,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-/**
+/*
* insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
* Takes care to allocate a 4K block of memory if the number of mappings exceeds
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -814,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b)
return 0;
}
-/**
+/*
* sort_idmaps - Sorts an array of idmap entries.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -841,6 +866,60 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
}
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+ struct user_namespace *map_ns,
+ struct uid_gid_map *new_map)
+{
+ int idx;
+ const struct user_namespace *file_ns = file->f_cred->user_ns;
+ struct uid_gid_extent *extent0 = NULL;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent0 = &new_map->extent[idx];
+ else
+ extent0 = &new_map->forward[idx];
+ if (extent0->lower_first == 0)
+ break;
+
+ extent0 = NULL;
+ }
+
+ if (!extent0)
+ return true;
+
+ if (map_ns == file_ns) {
+ /* The process unshared its ns and is writing to its own
+ * /proc/self/uid_map. User already has full capabilites in
+ * the new namespace. Verify that the parent had CAP_SETFCAP
+ * when it unshared.
+ * */
+ if (!file_ns->parent_could_setfcap)
+ return false;
+ } else {
+ /* Process p1 is writing to uid_map of p2, who is in a child
+ * user namespace to p1's. Verify that the opener of the map
+ * file has CAP_SETFCAP against the parent of the new map
+ * namespace */
+ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+ return false;
+ }
+
+ return true;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -848,7 +927,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map *parent_map)
{
struct seq_file *seq = file->private_data;
- struct user_namespace *ns = seq->private;
+ struct user_namespace *map_ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
@@ -895,7 +974,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/*
* Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
goto out;
/* Parse the user data */
@@ -959,13 +1038,13 @@ static ssize_t map_write(struct file *file, const char __user *buf,
goto out;
ret = -EINVAL;
}
- /* Be very certaint the new map actually exists */
+ /* Be very certain the new map actually exists */
if (new_map.nr_extents == 0)
goto out;
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
goto out;
ret = -EPERM;
@@ -1086,6 +1165,10 @@ static bool new_idmap_permitted(const struct file *file,
struct uid_gid_map *new_map)
{
const struct cred *cred = file->f_cred;
+
+ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+ return false;
+
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
@@ -1110,7 +1193,7 @@ static bool new_idmap_permitted(const struct file *file,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
- * And the opener of the id file also had the approprpiate capability.
+ * And the opener of the id file also has the appropriate capability.
*/
if (ns_capable(ns->parent, cap_setid) &&
file_ns_capable(file, ns->parent, cap_setid))
@@ -1281,6 +1364,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
+ if (set_cred_ucounts(cred) < 0)
+ return -EINVAL;
+
return 0;
}
@@ -1319,7 +1405,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
- user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
new file mode 100644
index 000000000000..8303f4c7ca71
--- /dev/null
+++ b/kernel/usermode_driver.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
+#include <linux/usermode_driver.h>
+
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+ struct file_system_type *type;
+ struct vfsmount *mnt;
+ struct file *file;
+ ssize_t written;
+ loff_t pos = 0;
+
+ type = get_fs_type("tmpfs");
+ if (!type)
+ return ERR_PTR(-ENODEV);
+
+ mnt = kern_mount(type);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
+ if (IS_ERR(file)) {
+ kern_unmount(mnt);
+ return ERR_CAST(file);
+ }
+
+ written = kernel_write(file, data, len, &pos);
+ if (written != len) {
+ int err = written;
+ if (err >= 0)
+ err = -ENOMEM;
+ filp_close(file, NULL);
+ kern_unmount(mnt);
+ return ERR_PTR(err);
+ }
+
+ fput(file);
+
+ /* Flush delayed fput so exec can open the file read-only */
+ flush_delayed_fput();
+ task_work_run();
+ return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len: The lentgh of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+ struct vfsmount *mnt;
+
+ if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+ return -EBUSY;
+
+ mnt = blob_to_mnt(data, len, info->driver_name);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ info->wd.mnt = mnt;
+ info->wd.dentry = mnt->mnt_root;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+ if (WARN_ON_ONCE(!info->wd.mnt ||
+ !info->wd.dentry ||
+ info->wd.mnt->mnt_root != info->wd.dentry))
+ return -EINVAL;
+
+ kern_unmount(info->wd.mnt);
+ info->wd.mnt = NULL;
+ info->wd.dentry = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct umd_info *umd_info = info->data;
+ struct file *from_umh[2];
+ struct file *to_umh[2];
+ int err;
+
+ /* create pipe to send data to umh */
+ err = create_pipe_files(to_umh, 0);
+ if (err)
+ return err;
+ err = replace_fd(0, to_umh[0], 0);
+ fput(to_umh[0]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ return err;
+ }
+
+ /* create pipe to receive data from umh */
+ err = create_pipe_files(from_umh, 0);
+ if (err) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ return err;
+ }
+ err = replace_fd(1, from_umh[1], 0);
+ fput(from_umh[1]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ fput(from_umh[0]);
+ return err;
+ }
+
+ set_fs_pwd(current->fs, &umd_info->wd);
+ umd_info->pipe_to_umh = to_umh[1];
+ umd_info->pipe_from_umh = from_umh[0];
+ umd_info->tgid = get_pid(task_tgid(current));
+ return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+ struct umd_info *umd_info = info->data;
+
+ /* cleanup if umh_setup() was successful but exec failed */
+ if (info->retval)
+ umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+ put_pid(info->tgid);
+ info->tgid = NULL;
+}
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
+
+/**
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a tgid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * tgid, and closing the pipes when user process is no longer needed.
+ */
+int fork_usermode_driver(struct umd_info *info)
+{
+ struct subprocess_info *sub_info;
+ const char *argv[] = { info->driver_name, NULL };
+ int err;
+
+ if (WARN_ON_ONCE(info->tgid))
+ return -EBUSY;
+
+ err = -ENOMEM;
+ sub_info = call_usermodehelper_setup(info->driver_name,
+ (char **)argv, NULL, GFP_KERNEL,
+ umd_setup, umd_cleanup, info);
+ if (!sub_info)
+ goto out;
+
+ err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
+
+
diff --git a/kernel/utsname.c b/kernel/utsname.c
index e488d0e2ab45..b1ac3ca870f2 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -33,7 +33,7 @@ static struct uts_namespace *create_uts_ns(void)
uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
if (uts_ns)
- kref_init(&uts_ns->kref);
+ refcount_set(&uts_ns->ns.count, 1);
return uts_ns;
}
@@ -103,11 +103,8 @@ struct uts_namespace *copy_utsname(unsigned long flags,
return new_ns;
}
-void free_uts_ns(struct kref *kref)
+void free_uts_ns(struct uts_namespace *ns)
{
- struct uts_namespace *ns;
-
- ns = container_of(kref, struct uts_namespace, kref);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ca61d49885b..019e3a1566cf 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/random.h>
#include <linux/sysctl.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
@@ -57,6 +58,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write,
* theoretically be incorrect if there are two parallel writes
* at non-zero offsets to the same sysctl.
*/
+ add_device_randomness(tmp_data, sizeof(tmp_data));
down_write(&uts_sem);
memcpy(get_uts(table), tmp_data, sizeof(tmp_data));
up_write(&uts_sem);
@@ -72,8 +74,16 @@ static int proc_do_uts_string(struct ctl_table *table, int write,
static DEFINE_CTL_TABLE_POLL(hostname_poll);
static DEFINE_CTL_TABLE_POLL(domainname_poll);
+// Note: update 'enum uts_proc' to match any changes to this table
static struct ctl_table uts_kern_table[] = {
{
+ .procname = "arch",
+ .data = init_uts_ns.name.machine,
+ .maxlen = sizeof(init_uts_ns.name.machine),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
.procname = "ostype",
.data = init_uts_ns.name.sysname,
.maxlen = sizeof(init_uts_ns.name.sysname),
@@ -113,15 +123,6 @@ static struct ctl_table uts_kern_table[] = {
{}
};
-static struct ctl_table uts_root_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = uts_kern_table,
- },
- {}
-};
-
#ifdef CONFIG_PROC_SYSCTL
/*
* Notify userspace about a change in a certain entry of uts_kern_table,
@@ -137,7 +138,7 @@ void uts_proc_notify(enum uts_proc proc)
static int __init utsname_sysctl_init(void)
{
- register_sysctl_table(uts_root_table);
+ register_sysctl("kernel", uts_kern_table);
return 0;
}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..da35e5b7f047
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+ VHOST_TASK_FLAGS_STOP,
+};
+
+struct vhost_task {
+ bool (*fn)(void *data);
+ void *data;
+ struct completion exited;
+ unsigned long flags;
+ struct task_struct *task;
+};
+
+static int vhost_task_fn(void *data)
+{
+ struct vhost_task *vtsk = data;
+ bool dead = false;
+
+ for (;;) {
+ bool did_work;
+
+ if (!dead && signal_pending(current)) {
+ struct ksignal ksig;
+ /*
+ * Calling get_signal will block in SIGSTOP,
+ * or clear fatal_signal_pending, but remember
+ * what was set.
+ *
+ * This thread won't actually exit until all
+ * of the file descriptors are closed, and
+ * the release function is called.
+ */
+ dead = get_signal(&ksig);
+ if (dead)
+ clear_thread_flag(TIF_SIGPENDING);
+ }
+
+ /* mb paired w/ vhost_task_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ did_work = vtsk->fn(vtsk->data);
+ if (!did_work)
+ schedule();
+ }
+
+ complete(&vtsk->exited);
+ do_exit(0);
+}
+
+/**
+ * vhost_task_wake - wakeup the vhost_task
+ * @vtsk: vhost_task to wake
+ *
+ * wake up the vhost_task worker thread
+ */
+void vhost_task_wake(struct vhost_task *vtsk)
+{
+ wake_up_process(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_wake);
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * vhost_task_fn ensures the worker thread exits after
+ * VHOST_TASK_FLAGS_SOP becomes true.
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+ set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+ vhost_task_wake(vtsk);
+ /*
+ * Make sure vhost_task_fn is no longer accessing the vhost_task before
+ * freeing it below.
+ */
+ wait_for_completion(&vtsk->exited);
+ kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_create - create a copy of a task to be used by the kernel
+ * @fn: vhost worker function
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
+ const char *name)
+{
+ struct kernel_clone_args args = {
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+ CLONE_THREAD | CLONE_SIGHAND,
+ .exit_signal = 0,
+ .fn = vhost_task_fn,
+ .name = name,
+ .user_worker = 1,
+ .no_files = 1,
+ };
+ struct vhost_task *vtsk;
+ struct task_struct *tsk;
+
+ vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+ if (!vtsk)
+ return NULL;
+ init_completion(&vtsk->exited);
+ vtsk->data = arg;
+ vtsk->fn = fn;
+
+ args.fn_arg = vtsk;
+
+ tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+ if (IS_ERR(tsk)) {
+ kfree(vtsk);
+ return NULL;
+ }
+
+ vtsk->task = tsk;
+ return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+ wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index f74020f6bd9d..03b90d7d2175 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -4,7 +4,7 @@
* Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
- * See Documentation/watch_queue.rst
+ * See Documentation/core-api/watch_queue.rst
*/
#define pr_fmt(fmt) "watchq: " fmt
@@ -29,11 +29,31 @@
MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+/*
+ * This must be called under the RCU read-lock, which makes
+ * sure that the wqueue still exists. It can then take the lock,
+ * and check that the wqueue hasn't been destroyed, which in
+ * turn makes sure that the notification pipe still exists.
+ */
+static inline bool lock_wqueue(struct watch_queue *wqueue)
+{
+ spin_lock_bh(&wqueue->lock);
+ if (unlikely(!wqueue->pipe)) {
+ spin_unlock_bh(&wqueue->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void unlock_wqueue(struct watch_queue *wqueue)
+{
+ spin_unlock_bh(&wqueue->lock);
+}
+
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -54,6 +74,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit += page->index;
set_bit(bit, wqueue->notes_bitmap);
+ generic_pipe_buf_release(pipe, buf);
}
// No try_steal function => no stealing
@@ -68,6 +89,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
/*
* Post a notification to a watch queue.
+ *
+ * Must be called with the RCU lock for reading, and the
+ * watch_queue lock held, which guarantees that the pipe
+ * hasn't been released.
*/
static bool post_one_notification(struct watch_queue *wqueue,
struct watch_notification *n)
@@ -79,14 +104,8 @@ static bool post_one_notification(struct watch_queue *wqueue,
unsigned int head, tail, mask, note, offset, len;
bool done = false;
- if (!pipe)
- return false;
-
spin_lock_irq(&pipe->rd_wait.lock);
- if (wqueue->defunct)
- goto out;
-
mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;
@@ -112,7 +131,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
buf->offset = offset;
buf->len = len;
buf->flags = PIPE_BUF_FLAG_WHOLE;
- pipe->head = head + 1;
+ smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -202,7 +221,10 @@ void __post_watch_notification(struct watch_list *wlist,
if (security_post_notification(watch->cred, cred, n) < 0)
continue;
- post_one_notification(wqueue, n);
+ if (lock_wqueue(wqueue)) {
+ post_one_notification(wqueue, n);
+ unlock_wqueue(wqueue);
+ }
}
rcu_read_unlock();
@@ -219,7 +241,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
struct page **pages;
unsigned long *bitmap;
unsigned long user_bufs;
- unsigned int bmsize;
int ret, i, nr_pages;
if (!wqueue)
@@ -243,11 +264,13 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
}
- ret = pipe_resize_ring(pipe, nr_notes);
+ nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
if (ret < 0)
goto error;
- pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+ ret = -ENOMEM;
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto error;
@@ -258,21 +281,19 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
}
- bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
- bmsize *= sizeof(unsigned long);
- bitmap = kmalloc(bmsize, GFP_KERNEL);
+ bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
if (!bitmap)
goto error_p;
- memset(bitmap, 0xff, bmsize);
+ bitmap_fill(bitmap, nr_notes);
wqueue->notes = pages;
wqueue->notes_bitmap = bitmap;
wqueue->nr_pages = nr_pages;
- wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ wqueue->nr_notes = nr_notes;
return 0;
error_p:
- for (i = 0; i < nr_pages; i++)
+ while (--i >= 0)
__free_page(pages[i]);
kfree(pages);
error:
@@ -310,7 +331,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
filter.__reserved != 0)
return -EINVAL;
- tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+ tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf));
if (IS_ERR(tf))
return PTR_ERR(tf);
@@ -320,7 +341,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
tf[i].info_mask & WATCH_INFO_LENGTH)
goto err_filter;
/* Ignore any unknown types */
- if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
nr_filter++;
}
@@ -336,7 +357,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
q = wfilter->filters;
for (i = 0; i < filter.nr_filters; i++) {
- if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
q->type = tf[i].type;
@@ -371,6 +392,8 @@ static void __put_watch_queue(struct kref *kref)
for (i = 0; i < wqueue->nr_pages; i++)
__free_page(wqueue->notes[i]);
+ kfree(wqueue->notes);
+ bitmap_free(wqueue->notes_bitmap);
wfilter = rcu_access_pointer(wqueue->filter);
if (wfilter)
@@ -393,7 +416,9 @@ static void free_watch(struct rcu_head *rcu)
struct watch *watch = container_of(rcu, struct watch, rcu);
put_watch_queue(rcu_access_pointer(watch->queue));
+ atomic_dec(&watch->cred->user->nr_watches);
put_cred(watch->cred);
+ kfree(watch);
}
static void __put_watch(struct kref *kref)
@@ -412,7 +437,7 @@ static void put_watch(struct watch *watch)
}
/**
- * init_watch_queue - Initialise a watch
+ * init_watch - Initialise a watch
* @watch: The watch to initialise.
* @wqueue: The queue to assign.
*
@@ -426,6 +451,33 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
rcu_assign_pointer(watch->queue, wqueue);
}
+static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
+{
+ const struct cred *cred;
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ cred = current_cred();
+ if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
+ atomic_dec(&cred->user->nr_watches);
+ return -EAGAIN;
+ }
+
+ watch->cred = get_cred(cred);
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+
/**
* add_watch_to_object - Add a watch on an object to a watch list
* @watch: The watch to add
@@ -440,26 +492,21 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
*/
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
- struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
- struct watch *w;
-
- hlist_for_each_entry(w, &wlist->watchers, list_node) {
- struct watch_queue *wq = rcu_access_pointer(w->queue);
- if (wqueue == wq && watch->id == w->id)
- return -EBUSY;
- }
+ struct watch_queue *wqueue;
+ int ret = -ENOENT;
- watch->cred = get_current_cred();
- rcu_assign_pointer(watch->watch_list, wlist);
+ rcu_read_lock();
- spin_lock_bh(&wqueue->lock);
- kref_get(&wqueue->usage);
- kref_get(&watch->usage);
- hlist_add_head(&watch->queue_node, &wqueue->watches);
- spin_unlock_bh(&wqueue->lock);
+ wqueue = rcu_access_pointer(watch->queue);
+ if (lock_wqueue(wqueue)) {
+ spin_lock(&wlist->lock);
+ ret = add_one_watch(watch, wlist, wqueue);
+ spin_unlock(&wlist->lock);
+ unlock_wqueue(wqueue);
+ }
- hlist_add_head(&watch->list_node, &wlist->watchers);
- return 0;
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(add_watch_to_object);
@@ -510,20 +557,15 @@ found:
wqueue = rcu_dereference(watch->queue);
- /* We don't need the watch list lock for the next bit as RCU is
- * protecting *wqueue from deallocation.
- */
- if (wqueue) {
+ if (lock_wqueue(wqueue)) {
post_one_notification(wqueue, &n.watch);
- spin_lock_bh(&wqueue->lock);
-
if (!hlist_unhashed(&watch->queue_node)) {
hlist_del_init_rcu(&watch->queue_node);
put_watch(watch);
}
- spin_unlock_bh(&wqueue->lock);
+ unlock_wqueue(wqueue);
}
if (wlist->release_watch) {
@@ -558,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new additions and prevent notifications from happening */
- wqueue->defunct = true;
+ /*
+ * This pipe can be freed by callers like free_pipe_info().
+ * Removing this reference also prevents new notifications.
+ */
+ wqueue->pipe = NULL;
while (!hlist_empty(&wqueue->watches)) {
watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b22ad13..81a8862295d6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,22 +29,18 @@
static DEFINE_MUTEX(watchdog_mutex);
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
-# define NMI_WATCHDOG_DEFAULT 1
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
+# define WATCHDOG_HARDLOCKUP_DEFAULT 1
#else
-# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
-# define NMI_WATCHDOG_DEFAULT 0
+# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
-int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
-int __read_mostly soft_watchdog_user_enabled = 1;
+static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
+static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
-static int __read_mostly nmi_watchdog_available;
-
-static struct cpumask watchdog_allowed_mask __read_mostly;
+static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -59,7 +55,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+ IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -70,7 +66,7 @@ unsigned int __read_mostly hardlockup_panic =
*/
void __init hardlockup_detector_disable(void)
{
- nmi_watchdog_user_enabled = 0;
+ watchdog_hardlockup_user_enabled = 0;
}
static int __init hardlockup_panic_setup(char *str)
@@ -80,54 +76,178 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- nmi_watchdog_user_enabled = 0;
+ watchdog_hardlockup_user_enabled = 0;
else if (!strncmp(str, "1", 1))
- nmi_watchdog_user_enabled = 1;
+ watchdog_hardlockup_user_enabled = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-/*
- * These functions can be overridden if an architecture implements its
- * own hardlockup detector.
- *
- * watchdog_nmi_enable/disable can be implemented to start and stop when
- * softlockup watchdog threads start and stop. The arch must select the
- * SOFTLOCKUP_DETECTOR Kconfig.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)
+
+static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
+static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
+static unsigned long hard_lockup_nmi_warn;
+
+notrace void arch_touch_nmi_watchdog(void)
{
- hardlockup_detector_perf_enable();
- return 0;
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_hardlockup_touched, true);
+}
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+void watchdog_hardlockup_touch_cpu(unsigned int cpu)
+{
+ per_cpu(watchdog_hardlockup_touched, cpu) = true;
+}
+
+static bool is_hardlockup(unsigned int cpu)
+{
+ int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return true;
+
+ /*
+ * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
+ * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
+ * written/read by a single CPU.
+ */
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+
+ return false;
}
-void __weak watchdog_nmi_disable(unsigned int cpu)
+static void watchdog_hardlockup_kick(void)
{
- hardlockup_detector_perf_disable();
+ int new_interrupts;
+
+ new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
+ watchdog_buddy_check_hardlockup(new_interrupts);
+}
+
+void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
+{
+ if (per_cpu(watchdog_hardlockup_touched, cpu)) {
+ per_cpu(watchdog_hardlockup_touched, cpu) = false;
+ return;
+ }
+
+ /*
+ * Check for a hardlockup by making sure the CPU's timer
+ * interrupt is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup(cpu)) {
+ unsigned int this_cpu = smp_processor_id();
+ unsigned long flags;
+
+ /* Only print hardlockups once. */
+ if (per_cpu(watchdog_hardlockup_warned, cpu))
+ return;
+
+ /*
+ * Prevent multiple hard-lockup reports if one cpu is already
+ * engaged in dumping all cpu back traces.
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
+ return;
+ }
+
+ /*
+ * NOTE: we call printk_cpu_sync_get_irqsave() after printing
+ * the lockup message. While it would be nice to serialize
+ * that printout, we really want to make sure that if some
+ * other CPU somehow locked up while holding the lock associated
+ * with printk_cpu_sync_get_irqsave() that we can still at least
+ * get the message about the lockup out.
+ */
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+ printk_cpu_sync_get_irqsave(flags);
+
+ print_modules();
+ print_irqtrace_events(current);
+ if (cpu == this_cpu) {
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
+ } else {
+ printk_cpu_sync_put_irqrestore(flags);
+ trigger_single_cpu_backtrace(cpu);
+ }
+
+ if (sysctl_hardlockup_all_cpu_backtrace) {
+ trigger_allbutcpu_cpu_backtrace(cpu);
+ if (!hardlockup_panic)
+ clear_bit_unlock(0, &hard_lockup_nmi_warn);
+ }
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ per_cpu(watchdog_hardlockup_warned, cpu) = true;
+ } else {
+ per_cpu(watchdog_hardlockup_warned, cpu) = false;
+ }
}
-/* Return 0, if a NMI watchdog is available. Error code otherwise */
-int __weak __init watchdog_nmi_probe(void)
+#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
+
+static inline void watchdog_hardlockup_kick(void) { }
+
+#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
+
+/*
+ * These functions can be overridden based on the configured hardlockdup detector.
+ *
+ * watchdog_hardlockup_enable/disable can be implemented to start and stop when
+ * softlockup watchdog start and stop. The detector must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
+ */
+void __weak watchdog_hardlockup_enable(unsigned int cpu) { }
+
+void __weak watchdog_hardlockup_disable(unsigned int cpu) { }
+
+/*
+ * Watchdog-detector specific API.
+ *
+ * Return 0 when hardlockup watchdog is available, negative value otherwise.
+ * Note that the negative value means that a delayed probe might
+ * succeed later.
+ */
+int __weak __init watchdog_hardlockup_probe(void)
{
- return hardlockup_detector_perf_init();
+ return -ENODEV;
}
/**
- * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
*
* The reconfiguration steps are:
- * watchdog_nmi_stop();
+ * watchdog_hardlockup_stop();
* update_variables();
- * watchdog_nmi_start();
+ * watchdog_hardlockup_start();
*/
-void __weak watchdog_nmi_stop(void) { }
+void __weak watchdog_hardlockup_stop(void) { }
/**
- * watchdog_nmi_start - Start the watchdog after reconfiguration
+ * watchdog_hardlockup_start - Start the watchdog after reconfiguration
*
- * Counterpart to watchdog_nmi_stop().
+ * Counterpart to watchdog_hardlockup_stop().
*
* The following variables have been updated in update_variables() and
* contain the currently valid configuration:
@@ -135,48 +255,61 @@ void __weak watchdog_nmi_stop(void) { }
* - watchdog_thresh
* - watchdog_cpumask
*/
-void __weak watchdog_nmi_start(void) { }
+void __weak watchdog_hardlockup_start(void) { }
/**
* lockup_detector_update_enable - Update the sysctl enable bit
*
- * Caller needs to make sure that the NMI/perf watchdogs are off, so this
- * can't race with watchdog_nmi_disable().
+ * Caller needs to make sure that the hard watchdogs are off, so this
+ * can't race with watchdog_hardlockup_disable().
*/
static void lockup_detector_update_enable(void)
{
watchdog_enabled = 0;
if (!watchdog_user_enabled)
return;
- if (nmi_watchdog_available && nmi_watchdog_user_enabled)
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- if (soft_watchdog_user_enabled)
- watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
+ if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled)
+ watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;
+ if (watchdog_softlockup_user_enabled)
+ watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
}
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-#define SOFTLOCKUP_RESET ULONG_MAX
+/*
+ * Delay the soflockup report when running a known slow code.
+ * It does _not_ affect the timestamp of the last successdul reschedule.
+ */
+#define SOFTLOCKUP_DELAY_REPORT ULONG_MAX
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif
+static struct cpumask watchdog_allowed_mask __read_mostly;
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+ IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
+/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+/* Timestamp of the last softlockup report. */
+static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(bool, soft_watchdog_warn);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
static int __init nowatchdog_setup(char *str)
{
watchdog_user_enabled = 0;
@@ -186,7 +319,7 @@ __setup("nowatchdog", nowatchdog_setup);
static int __init nosoftlockup_setup(char *str)
{
- soft_watchdog_user_enabled = 0;
+ watchdog_softlockup_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
@@ -235,10 +368,16 @@ static void set_sample_period(void)
watchdog_update_hrtimer_threshold(sample_period);
}
+static void update_report_ts(void)
+{
+ __this_cpu_write(watchdog_report_ts, get_timestamp());
+}
+
/* Commands for resetting the watchdog */
-static void __touch_watchdog(void)
+static void update_touch_ts(void)
{
__this_cpu_write(watchdog_touch_ts, get_timestamp());
+ update_report_ts();
}
/**
@@ -252,10 +391,10 @@ static void __touch_watchdog(void)
notrace void touch_softlockup_watchdog_sched(void)
{
/*
- * Preemption can be enabled. It doesn't matter which CPU's timestamp
- * gets zeroed here, so use the raw_ operation.
+ * Preemption can be enabled. It doesn't matter which CPU's watchdog
+ * report period gets restarted here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+ raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
notrace void touch_softlockup_watchdog(void)
@@ -278,51 +417,36 @@ void touch_all_softlockup_watchdogs(void)
* update as well, the only side effect might be a cycle delay for
* the softlockup check.
*/
- for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
- wq_watchdog_touch(-1);
+ for_each_cpu(cpu, &watchdog_allowed_mask) {
+ per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
+ wq_watchdog_touch(cpu);
+ }
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+ __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
-static int is_softlockup(unsigned long touch_ts)
+static int is_softlockup(unsigned long touch_ts,
+ unsigned long period_ts,
+ unsigned long now)
{
- unsigned long now = get_timestamp();
-
- if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
+ if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
/* Warn about unreasonable delays. */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
+ if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
}
return 0;
}
/* watchdog detector functions */
-bool is_hardlockup(void)
-{
- unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
- if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return true;
-
- __this_cpu_write(hrtimer_interrupts_saved, hrint);
- return false;
-}
-
-static void watchdog_interrupt_count(void)
-{
- __this_cpu_inc(hrtimer_interrupts);
-}
-
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/*
- * The watchdog thread function - touches the timestamp.
+ * The watchdog feed function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
@@ -331,7 +455,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
- __touch_watchdog();
+ update_touch_ts();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -340,16 +464,16 @@ static int softlockup_fn(void *data)
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
- unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+ unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ unsigned long flags;
if (!watchdog_enabled)
return HRTIMER_NORESTART;
- /* kick the hardlockup detector */
- watchdog_interrupt_count();
+ watchdog_hardlockup_kick();
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
@@ -362,7 +486,26 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == SOFTLOCKUP_RESET) {
+ /*
+ * Read the current timestamp first. It might become invalid anytime
+ * when a virtual machine is stopped by the host or when the watchog
+ * is touched from NMI.
+ */
+ now = get_timestamp();
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a soft lockup. This function touches the watchdog.
+ */
+ kvm_check_and_clear_guest_paused();
+ /*
+ * The stored timestamp is comparable with @now only when not touched.
+ * It might get touched anytime from NMI. Make sure that is_softlockup()
+ * uses the same (valid) value.
+ */
+ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+
+ /* Reset the interval when touched by known problematic code. */
+ if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -372,43 +515,27 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
sched_clock_tick();
}
- /* Clear the guest paused flag on watchdog reset */
- kvm_check_and_clear_guest_paused();
- __touch_watchdog();
+ update_report_ts();
return HRTIMER_RESTART;
}
- /* check for a softlockup
- * This is done by making sure a high priority task is
- * being scheduled. The task touches the watchdog to
- * indicate it is getting cpu time. If it hasn't then
- * this is a good indication some task is hogging the cpu
- */
- duration = is_softlockup(touch_ts);
+ /* Check for a softlockup. */
+ touch_ts = __this_cpu_read(watchdog_touch_ts);
+ duration = is_softlockup(touch_ts, period_ts, now);
if (unlikely(duration)) {
/*
- * If a virtual machine is stopped by the host it can look to
- * the watchdog like a soft lockup, check to see if the host
- * stopped the vm before we issue the warning
+ * Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping all cpu back traces.
*/
- if (kvm_check_and_clear_guest_paused())
- return HRTIMER_RESTART;
-
- /* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
- return HRTIMER_RESTART;
-
if (softlockup_all_cpu_backtrace) {
- /* Prevent multiple soft-lockup reports if one cpu is already
- * engaged in dumping cpu back traces
- */
- if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
- /* Someone else will report us. Let's give up */
- __this_cpu_write(soft_watchdog_warn, true);
+ if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
return HRTIMER_RESTART;
- }
}
+ /* Start period for the next softlockup warning. */
+ update_report_ts();
+
+ printk_cpu_sync_get_irqsave(flags);
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -418,24 +545,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
show_regs(regs);
else
dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
if (softlockup_all_cpu_backtrace) {
- /* Avoid generating two back traces for current
- * given that one is already made above
- */
- trigger_allbutself_cpu_backtrace();
-
- clear_bit(0, &soft_lockup_nmi_warn);
- /* Barrier to sync with other cpus */
- smp_mb__after_atomic();
+ trigger_allbutcpu_cpu_backtrace(smp_processor_id());
+ if (!softlockup_panic)
+ clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
- __this_cpu_write(soft_watchdog_warn, true);
- } else
- __this_cpu_write(soft_watchdog_warn, false);
+ }
return HRTIMER_RESTART;
}
@@ -451,7 +572,7 @@ static void watchdog_enable(unsigned int cpu)
complete(done);
/*
- * Start the timer first to prevent the NMI watchdog triggering
+ * Start the timer first to prevent the hardlockup watchdog triggering
* before the timer has a chance to fire.
*/
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
@@ -460,10 +581,10 @@ static void watchdog_enable(unsigned int cpu)
HRTIMER_MODE_REL_PINNED_HARD);
/* Initialize timestamp */
- __touch_watchdog();
- /* Enable the perf event */
- if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
- watchdog_nmi_enable(cpu);
+ update_touch_ts();
+ /* Enable the hardlockup detector */
+ if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
+ watchdog_hardlockup_enable(cpu);
}
static void watchdog_disable(unsigned int cpu)
@@ -473,11 +594,11 @@ static void watchdog_disable(unsigned int cpu)
WARN_ON_ONCE(cpu != smp_processor_id());
/*
- * Disable the perf event first. That prevents that a large delay
- * between disabling the timer and disabling the perf event causes
- * the perf NMI to detect a false positive.
+ * Disable the hardlockup detector first. That prevents that a large
+ * delay between disabling the timer and disabling the hardlockup
+ * detector causes a false positive.
*/
- watchdog_nmi_disable(cpu);
+ watchdog_hardlockup_disable(cpu);
hrtimer_cancel(hrtimer);
wait_for_completion(this_cpu_ptr(&softlockup_completion));
}
@@ -530,10 +651,10 @@ int lockup_detector_offline_cpu(unsigned int cpu)
return 0;
}
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
{
cpus_read_lock();
- watchdog_nmi_stop();
+ watchdog_hardlockup_stop();
softlockup_stop_all();
set_sample_period();
@@ -541,7 +662,7 @@ static void lockup_detector_reconfigure(void)
if (watchdog_enabled && watchdog_thresh)
softlockup_start_all();
- watchdog_nmi_start();
+ watchdog_hardlockup_start();
cpus_read_unlock();
/*
* Must be called outside the cpus locked section to prevent
@@ -550,12 +671,15 @@ static void lockup_detector_reconfigure(void)
__lockup_detector_cleanup();
}
+void lockup_detector_reconfigure(void)
+{
+ mutex_lock(&watchdog_mutex);
+ __lockup_detector_reconfigure();
+ mutex_unlock(&watchdog_mutex);
+}
+
/*
- * Create the watchdog thread infrastructure and configure the detector(s).
- *
- * The threads are not unparked as watchdog_allowed_mask is empty. When
- * the threads are successfully initialized, take the proper locks and
- * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ * Create the watchdog infrastructure and configure the detector(s).
*/
static __init void lockup_detector_setup(void)
{
@@ -570,23 +694,27 @@ static __init void lockup_detector_setup(void)
return;
mutex_lock(&watchdog_mutex);
- lockup_detector_reconfigure();
+ __lockup_detector_reconfigure();
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
{
cpus_read_lock();
- watchdog_nmi_stop();
+ watchdog_hardlockup_stop();
lockup_detector_update_enable();
- watchdog_nmi_start();
+ watchdog_hardlockup_start();
cpus_read_unlock();
}
+void lockup_detector_reconfigure(void)
+{
+ __lockup_detector_reconfigure();
+}
static inline void lockup_detector_setup(void)
{
- lockup_detector_reconfigure();
+ __lockup_detector_reconfigure();
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -621,25 +749,25 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
-/* Propagate any changes to the watchdog threads */
+/* Propagate any changes to the watchdog infrastructure */
static void proc_watchdog_update(void)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- lockup_detector_reconfigure();
+ __lockup_detector_reconfigure();
}
/*
* common function for watchdog, nmi_watchdog and soft_watchdog parameter
*
- * caller | table->data points to | 'which'
- * -------------------|----------------------------|--------------------------
- * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
- * | | SOFT_WATCHDOG_ENABLED
- * -------------------|----------------------------|--------------------------
- * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
- * -------------------|----------------------------|--------------------------
- * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
+ * caller | table->data points to | 'which'
+ * -------------------|----------------------------------|-------------------------------
+ * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED |
+ * | | WATCHDOG_SOFTOCKUP_ENABLED
+ * -------------------|----------------------------------|-------------------------------
+ * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
+ * -------------------|----------------------------------|-------------------------------
+ * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -671,7 +799,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
int proc_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
+ WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -681,9 +810,9 @@ int proc_watchdog(struct ctl_table *table, int write,
int proc_nmi_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- if (!nmi_watchdog_available && write)
+ if (!watchdog_hardlockup_available && write)
return -ENOTSUPP;
- return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -693,7 +822,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
int proc_soft_watchdog(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -737,17 +866,187 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
mutex_unlock(&watchdog_mutex);
return err;
}
+
+static const int sixty = 60;
+
+static struct ctl_table watchdog_sysctls[] = {
+ {
+ .procname = "watchdog",
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "watchdog_thresh",
+ .data = &watchdog_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog_thresh,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&sixty,
+ },
+ {
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+ {
+ .procname = "soft_watchdog",
+ .data = &watchdog_softlockup_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "softlockup_panic",
+ .data = &softlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+ {
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "hardlockup_all_cpu_backtrace",
+ .data = &sysctl_hardlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+#endif
+ {}
+};
+
+static struct ctl_table watchdog_hardlockup_sysctl[] = {
+ {
+ .procname = "nmi_watchdog",
+ .data = &watchdog_hardlockup_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init watchdog_sysctl_init(void)
+{
+ register_sysctl_init("kernel", watchdog_sysctls);
+
+ if (watchdog_hardlockup_available)
+ watchdog_hardlockup_sysctl[0].mode = 0644;
+ register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
+}
+
+#else
+#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
+static void __init lockup_detector_delay_init(struct work_struct *work);
+static bool allow_lockup_detector_init_retry __initdata;
+
+static struct work_struct detector_work __initdata =
+ __WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
+
+static void __init lockup_detector_delay_init(struct work_struct *work)
+{
+ int ret;
+
+ ret = watchdog_hardlockup_probe();
+ if (ret) {
+ pr_info("Delayed init of the lockup detector failed: %d\n", ret);
+ pr_info("Hard watchdog permanently disabled\n");
+ return;
+ }
+
+ allow_lockup_detector_init_retry = false;
+
+ watchdog_hardlockup_available = true;
+ lockup_detector_setup();
+}
+
+/*
+ * lockup_detector_retry_init - retry init lockup detector if possible.
+ *
+ * Retry hardlockup detector init. It is useful when it requires some
+ * functionality that has to be initialized later on a particular
+ * platform.
+ */
+void __init lockup_detector_retry_init(void)
+{
+ /* Must be called before late init calls */
+ if (!allow_lockup_detector_init_retry)
+ return;
+
+ schedule_work(&detector_work);
+}
+
+/*
+ * Ensure that optional delayed hardlockup init is proceed before
+ * the init code and memory is freed.
+ */
+static int __init lockup_detector_check(void)
+{
+ /* Prevent any later retry. */
+ allow_lockup_detector_init_retry = false;
+
+ /* Make sure no work is pending. */
+ flush_work(&detector_work);
+
+ watchdog_sysctl_init();
+
+ return 0;
+
+}
+late_initcall_sync(lockup_detector_check);
+
void __init lockup_detector_init(void)
{
if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask,
- housekeeping_cpumask(HK_FLAG_TIMER));
+ housekeeping_cpumask(HK_TYPE_TIMER));
+
+ if (!watchdog_hardlockup_probe())
+ watchdog_hardlockup_available = true;
+ else
+ allow_lockup_detector_init_retry = true;
- if (!watchdog_nmi_probe())
- nmi_watchdog_available = true;
lockup_detector_setup();
}
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
new file mode 100644
index 000000000000..34dbfe091f4b
--- /dev/null
+++ b/kernel/watchdog_buddy.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/nmi.h>
+#include <linux/percpu-defs.h>
+
+static cpumask_t __read_mostly watchdog_cpus;
+
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &watchdog_cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&watchdog_cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+int __init watchdog_hardlockup_probe(void)
+{
+ return 0;
+}
+
+void watchdog_hardlockup_enable(unsigned int cpu)
+{
+ unsigned int next_cpu;
+
+ /*
+ * The new CPU will be marked online before the hrtimer interrupt
+ * gets a chance to run on it. If another CPU tests for a
+ * hardlockup on the new CPU before it has run its the hrtimer
+ * interrupt, it will get a false positive. Touch the watchdog on
+ * the new CPU to delay the check for at least 3 sampling periods
+ * to guarantee one hrtimer has run on the new CPU.
+ */
+ watchdog_hardlockup_touch_cpu(cpu);
+
+ /*
+ * We are going to check the next CPU. Our watchdog_hrtimer
+ * need not be zero if the CPU has already been online earlier.
+ * Touch the watchdog on the next CPU to avoid false positive
+ * if we try to check it in less then 3 interrupts.
+ */
+ next_cpu = watchdog_next_cpu(cpu);
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ /*
+ * Makes sure that watchdog is touched on this CPU before
+ * other CPUs could see it in watchdog_cpus. The counter
+ * part is in watchdog_buddy_check_hardlockup().
+ */
+ smp_wmb();
+
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_hardlockup_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this CPU will cause the CPU before this one to start
+ * checking the one after this one. If this CPU just finished checking
+ * the next CPU and updating hrtimer_interrupts_saved, and then the
+ * previous CPU checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next CPU to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ /*
+ * Makes sure that watchdog is touched on the next CPU before
+ * this CPU disappear in watchdog_cpus. The counter part is in
+ * watchdog_buddy_check_hardlockup().
+ */
+ smp_wmb();
+
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (hrtimer_interrupts % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next CPU */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * Make sure that the watchdog was touched on next CPU when
+ * watchdog_next_cpu() returned another one because of
+ * a change in watchdog_hardlockup_enable()/disable().
+ */
+ smp_rmb();
+
+ watchdog_hardlockup_check(next_cpu, NULL);
+}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_perf.c
index 247bf0b1582c..8ea00c4a24b2 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_perf.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Detect hard lockups on a system
+ * Detect hard lockups on a system using perf
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
@@ -20,28 +20,12 @@
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
static DEFINE_PER_CPU(struct perf_event *, dead_event);
static struct cpumask dead_events_mask;
-static unsigned long hardlockup_allcpu_dumped;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
-notrace void arch_touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
-}
-EXPORT_SYMBOL(arch_touch_nmi_watchdog);
-
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
@@ -114,61 +98,24 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
if (!watchdog_check_timestamp())
return;
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
- this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
+ watchdog_hardlockup_check(smp_processor_id(), regs);
}
static int hardlockup_detector_event_create(void)
{
- unsigned int cpu = smp_processor_id();
+ unsigned int cpu;
struct perf_event_attr *wd_attr;
struct perf_event *evt;
+ /*
+ * Preemption is not disabled because memory will be allocated.
+ * Ensure CPU-locality by calling this in per-CPU kthread.
+ */
+ WARN_ON(!is_percpu_thread());
+ cpu = raw_smp_processor_id();
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
@@ -185,10 +132,14 @@ static int hardlockup_detector_event_create(void)
}
/**
- * hardlockup_detector_perf_enable - Enable the local event
+ * watchdog_hardlockup_enable - Enable the local event
+ *
+ * @cpu: The CPU to enable hard lockup on.
*/
-void hardlockup_detector_perf_enable(void)
+void watchdog_hardlockup_enable(unsigned int cpu)
{
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
if (hardlockup_detector_event_create())
return;
@@ -200,12 +151,16 @@ void hardlockup_detector_perf_enable(void)
}
/**
- * hardlockup_detector_perf_disable - Disable the local event
+ * watchdog_hardlockup_disable - Disable the local event
+ *
+ * @cpu: The CPU to enable hard lockup on.
*/
-void hardlockup_detector_perf_disable(void)
+void watchdog_hardlockup_disable(unsigned int cpu)
{
struct perf_event *event = this_cpu_read(watchdog_ev);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
if (event) {
perf_event_disable(event);
this_cpu_write(watchdog_ev, NULL);
@@ -268,7 +223,7 @@ void __init hardlockup_detector_perf_restart(void)
lockdep_assert_cpus_held();
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
return;
for_each_online_cpu(cpu) {
@@ -279,12 +234,22 @@ void __init hardlockup_detector_perf_restart(void)
}
}
+bool __weak __init arch_perf_nmi_is_available(void)
+{
+ return true;
+}
+
/**
- * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ * watchdog_hardlockup_probe - Probe whether NMI event is available at all
*/
-int __init hardlockup_detector_perf_init(void)
+int __init watchdog_hardlockup_probe(void)
{
- int ret = hardlockup_detector_event_create();
+ int ret;
+
+ if (!arch_perf_nmi_is_available())
+ return -ENODEV;
+
+ ret = hardlockup_detector_event_create();
if (ret) {
pr_info("Perf NMI watchdog permanently disabled\n");
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c41c3c17b86a..7b482a26d741 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -49,7 +49,10 @@
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
+#include <linux/kvm_para.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
@@ -119,10 +122,11 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires pool->lock and should
- * be done only from local cpu. Either disabling preemption on local
- * cpu or grabbing pool->lock is enough for read access. If
- * POOL_DISASSOCIATED is set, it's identical to L.
+ * K: Only modified by worker while holding pool->lock. Can be safely read by
+ * self, while holding pool->lock or from IRQ context if %current is the
+ * kworker.
+ *
+ * S: Only modified by worker self.
*
* A: wq_pool_attach_mutex protected.
*
@@ -140,6 +144,8 @@ enum {
* WR: wq->mutex protected for writes. RCU protected for reads.
*
* MD: wq_mayday_lock protected.
+ *
+ * WD: Used internally by the watchdog.
*/
/* struct worker is defined in workqueue_internal.h */
@@ -149,18 +155,29 @@ struct worker_pool {
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
- unsigned int flags; /* X: flags */
+ unsigned int flags; /* L: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
+ bool cpu_stall; /* WD: stalled cpu bound pool */
+
+ /*
+ * The counter is incremented in a process context on the associated CPU
+ * w/ preemption disabled, and decremented or reset in the same context
+ * but w/ pool->lock held. The readers grab pool->lock and are
+ * guaranteed to see if the counter reached zero.
+ */
+ int nr_running;
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
- struct list_head idle_list; /* X: list of idle workers */
+ struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for workers */
+ struct work_struct idle_cull_work; /* L: worker idle cleanup */
+
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -168,6 +185,7 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
+ struct list_head dying_workers; /* A: workers about to die */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -177,18 +195,28 @@ struct worker_pool {
int refcnt; /* PL: refcnt for unbound pools */
/*
- * The current concurrency level. As it's likely to be accessed
- * from other CPUs during try_to_wake_up(), put it in a separate
- * cacheline.
- */
- atomic_t nr_running ____cacheline_aligned_in_smp;
-
- /*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
-} ____cacheline_aligned_in_smp;
+};
+
+/*
+ * Per-pool_workqueue statistics. These can be monitored using
+ * tools/workqueue/wq_monitor.py.
+ */
+enum pool_workqueue_stats {
+ PWQ_STAT_STARTED, /* work items started execution */
+ PWQ_STAT_COMPLETED, /* work items completed execution */
+ PWQ_STAT_CPU_TIME, /* total CPU time consumed */
+ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
+ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
+ PWQ_STAT_MAYDAY, /* maydays to rescuer */
+ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
+
+ PWQ_NR_STATS,
+};
/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@@ -204,19 +232,38 @@ struct pool_workqueue {
int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
+
+ /*
+ * nr_active management and WORK_STRUCT_INACTIVE:
+ *
+ * When pwq->nr_active >= max_active, new work item is queued to
+ * pwq->inactive_works instead of pool->worklist and marked with
+ * WORK_STRUCT_INACTIVE.
+ *
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate
+ * in pwq->nr_active and all work items in pwq->inactive_works are
+ * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
+ * work items are in pwq->inactive_works. Some of them are ready to
+ * run in pool->worklist or worker->scheduled. Those work itmes are
+ * only struct wq_barrier which is used for flush_work() and should
+ * not participate in pwq->nr_active. For non-barrier work item, it
+ * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ */
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
- struct list_head delayed_works; /* L: delayed works */
+ struct list_head inactive_works; /* L: inactive works */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ u64 stats[PWQ_NR_STATS];
+
/*
- * Release of unbound pwq is punted to system_wq. See put_pwq()
- * and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also RCU protected so that the first pwq can be
- * determined without grabbing wq->mutex.
+ * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
+ * and pwq_release_workfn() for details. pool_workqueue itself is also
+ * RCU protected so that the first pwq can be determined without
+ * grabbing wq->mutex.
*/
- struct work_struct unbound_release_work;
+ struct kthread_work release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
@@ -275,17 +322,43 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
+ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
};
static struct kmem_cache *pwq_cache;
-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
+/*
+ * Each pod type describes how CPUs should be grouped for unbound workqueues.
+ * See the comment above workqueue_attrs->affn_scope.
+ */
+struct wq_pod_type {
+ int nr_pods; /* number of pods */
+ cpumask_var_t *pod_cpus; /* pod -> cpus */
+ int *pod_node; /* pod -> node */
+ int *cpu_pod; /* cpu -> pod */
+};
+
+static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
-static bool wq_disable_numa;
-module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+ [WQ_AFFN_DFL] = "default",
+ [WQ_AFFN_CPU] = "cpu",
+ [WQ_AFFN_SMT] = "smt",
+ [WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_NUMA] = "numa",
+ [WQ_AFFN_SYSTEM] = "system",
+};
+
+/*
+ * Per-cpu work items which run for longer than the following threshold are
+ * automatically considered CPU intensive and excluded from concurrency
+ * management to prevent them from noticeably delaying other per-cpu work items.
+ * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
+ * The actual value is initialized in wq_cpu_intensive_thresh_init().
+ */
+static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
+module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
@@ -293,10 +366,8 @@ module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
-static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
-
-/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_pod_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -307,9 +378,18 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
+/* for further constrain wq_unbound_cpumask by cmdline parameter*/
+static struct cpumask wq_cmdline_cpumask __initdata;
+
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);
@@ -339,24 +419,32 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
-struct workqueue_struct *system_wq __read_mostly;
+/*
+ * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
+ * process context while holding a pool lock. Bounce to a dedicated kthread
+ * worker to avoid A-A deadlocks.
+ */
+static struct kthread_worker *pwq_release_worker __ro_after_init;
+
+struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
-struct workqueue_struct *system_highpri_wq __read_mostly;
+struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
-struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
-struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
-struct workqueue_struct *system_freezable_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
-struct workqueue_struct *system_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
-struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
+static void show_one_worker_pool(struct worker_pool *pool);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -427,7 +515,7 @@ static void show_pwq(struct pool_workqueue *pwq);
#ifdef CONFIG_DEBUG_OBJECTS_WORK
-static struct debug_obj_descr work_debug_descr;
+static const struct debug_obj_descr work_debug_descr;
static void *work_debug_hint(void *addr)
{
@@ -477,7 +565,7 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr work_debug_descr = {
+static const struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
.is_static_object = work_is_static_object,
@@ -523,7 +611,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
/**
- * worker_pool_assign_id - allocate ID and assing it to @pool
+ * worker_pool_assign_id - allocate ID and assign it to @pool
* @pool: the pool pointer of interest
*
* Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
@@ -544,43 +632,14 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
-/**
- * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
- * @wq: the target workqueue
- * @node: the node ID
- *
- * This must be called with any of wq_pool_mutex, wq->mutex or RCU
- * read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- *
- * Return: The unbound pool_workqueue for @node.
- */
-static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
- int node)
-{
- assert_rcu_or_wq_mutex_or_pool_mutex(wq);
-
- /*
- * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
- * delayed item is pending. The plan is to keep CPU -> NODE
- * mapping valid and stable across CPU on/offlines. Once that
- * happens, this workaround can be removed.
- */
- if (unlikely(node == NUMA_NO_NODE))
- return wq->dfl_pwq;
-
- return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
-}
-
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
}
-static int get_work_color(struct work_struct *work)
+static int get_work_color(unsigned long work_data)
{
- return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+ return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
((1 << WORK_STRUCT_COLOR_BITS) - 1);
}
@@ -678,12 +737,17 @@ static void clear_work_data(struct work_struct *work)
set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
+static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
+{
+ return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+}
+
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ return work_struct_pwq(data);
else
return NULL;
}
@@ -711,8 +775,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
assert_rcu_or_pool_mutex();
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+ return work_struct_pwq(data)->pool;
pool_id = data >> WORK_OFFQ_POOL_SHIFT;
if (pool_id == WORK_OFFQ_POOL_NONE)
@@ -733,8 +796,7 @@ static int get_work_pool_id(struct work_struct *work)
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+ return work_struct_pwq(data)->pool->id;
return data >> WORK_OFFQ_POOL_SHIFT;
}
@@ -760,11 +822,6 @@ static bool work_is_canceling(struct work_struct *work)
* they're being called with pool->lock held.
*/
-static bool __need_more_worker(struct worker_pool *pool)
-{
- return !atomic_read(&pool->nr_running);
-}
-
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
@@ -775,7 +832,7 @@ static bool __need_more_worker(struct worker_pool *pool)
*/
static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) && __need_more_worker(pool);
+ return !list_empty(&pool->worklist) && !pool->nr_running;
}
/* Can I start working? Called from busy but !running workers. */
@@ -787,8 +844,7 @@ static bool may_start_working(struct worker_pool *pool)
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) &&
- atomic_read(&pool->nr_running) <= 1;
+ return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}
/* Do we need a new worker? Called from manager. */
@@ -807,154 +863,23 @@ static bool too_many_workers(struct worker_pool *pool)
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
-/*
- * Wake up functions.
- */
-
-/* Return the first idle worker. Safe with preemption disabled */
-static struct worker *first_idle_worker(struct worker_pool *pool)
-{
- if (unlikely(list_empty(&pool->idle_list)))
- return NULL;
-
- return list_first_entry(&pool->idle_list, struct worker, entry);
-}
-
-/**
- * wake_up_worker - wake up an idle worker
- * @pool: worker pool to wake worker from
- *
- * Wake up the first idle worker of @pool.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- */
-static void wake_up_worker(struct worker_pool *pool)
-{
- struct worker *worker = first_idle_worker(pool);
-
- if (likely(worker))
- wake_up_process(worker->task);
-}
-
-/**
- * wq_worker_running - a worker is running again
- * @task: task waking up
- *
- * This function is called when a worker returns from schedule()
- */
-void wq_worker_running(struct task_struct *task)
-{
- struct worker *worker = kthread_data(task);
-
- if (!worker->sleeping)
- return;
- if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&worker->pool->nr_running);
- worker->sleeping = 0;
-}
-
-/**
- * wq_worker_sleeping - a worker is going to sleep
- * @task: task going to sleep
- *
- * This function is called from schedule() when a busy worker is
- * going to sleep. Preemption needs to be disabled to protect ->sleeping
- * assignment.
- */
-void wq_worker_sleeping(struct task_struct *task)
-{
- struct worker *next, *worker = kthread_data(task);
- struct worker_pool *pool;
-
- /*
- * Rescuers, which may not have all the fields set up like normal
- * workers, also reach here, let's not access anything before
- * checking NOT_RUNNING.
- */
- if (worker->flags & WORKER_NOT_RUNNING)
- return;
-
- pool = worker->pool;
-
- /* Return if preempted before wq_worker_running() was reached */
- if (worker->sleeping)
- return;
-
- worker->sleeping = 1;
- raw_spin_lock_irq(&pool->lock);
-
- /*
- * The counterpart of the following dec_and_test, implied mb,
- * worklist not empty test sequence is in insert_work().
- * Please read comment there.
- *
- * NOT_RUNNING is clear. This means that we're bound to and
- * running on the local cpu w/ rq lock held and preemption
- * disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without pool
- * lock is safe.
- */
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist)) {
- next = first_idle_worker(pool);
- if (next)
- wake_up_process(next->task);
- }
- raw_spin_unlock_irq(&pool->lock);
-}
-
-/**
- * wq_worker_last_func - retrieve worker's last work function
- * @task: Task to retrieve last work function of.
- *
- * Determine the last function a worker executed. This is called from
- * the scheduler to get a worker's last known identity.
- *
- * CONTEXT:
- * raw_spin_lock_irq(rq->lock)
- *
- * This function is called during schedule() when a kworker is going
- * to sleep. It's used by psi to identify aggregation workers during
- * dequeuing, to allow periodic aggregation to shut-off when that
- * worker is the last task in the system or cgroup to go to sleep.
- *
- * As this function doesn't involve any workqueue-related locking, it
- * only returns stable values when called from inside the scheduler's
- * queuing and dequeuing paths, when @task, which must be a kworker,
- * is guaranteed to not be processing any works.
- *
- * Return:
- * The last work function %current executed as a worker, NULL if it
- * hasn't executed any work yet.
- */
-work_func_t wq_worker_last_func(struct task_struct *task)
-{
- struct worker *worker = kthread_data(task);
-
- return worker->last_func;
-}
-
/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
*
* Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_dec(&pool->nr_running);
+ pool->nr_running--;
}
worker->flags |= flags;
@@ -966,16 +891,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* @flags: flags to clear
*
* Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
worker->flags &= ~flags;
@@ -986,7 +908,70 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&pool->nr_running);
+ pool->nr_running++;
+}
+
+/* Return the first idle worker. Called with pool->lock held. */
+static struct worker *first_idle_worker(struct worker_pool *pool)
+{
+ if (unlikely(list_empty(&pool->idle_list)))
+ return NULL;
+
+ return list_first_entry(&pool->idle_list, struct worker, entry);
+}
+
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
+
+ /* can't use worker_set_flags(), also called from create_worker() */
+ worker->flags |= WORKER_IDLE;
+ pool->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &pool->idle_list);
+
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+
+ /* Sanity check nr_running. */
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
+ worker_clr_flags(worker, WORKER_IDLE);
+ pool->nr_idle--;
+ list_del_init(&worker->entry);
}
/**
@@ -1042,13 +1027,10 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* @head: target list to append @work to
* @nextp: out parameter for nested worklist walking
*
- * Schedule linked works starting from @work to @head. Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work. This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
+ * Schedule linked works starting from @work to @head. Work series to be
+ * scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
+ * @nextp.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
@@ -1078,6 +1060,348 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
}
/**
+ * assign_work - assign a work item and its linked work items to a worker
+ * @work: work to assign
+ * @worker: worker to assign to
+ * @nextp: out parameter for nested worklist walking
+ *
+ * Assign @work and its linked work items to @worker. If @work is already being
+ * executed by another worker in the same pool, it'll be punted there.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of the last
+ * scheduled work. This allows assign_work() to be nested inside
+ * list_for_each_entry_safe().
+ *
+ * Returns %true if @work was successfully assigned to @worker. %false if @work
+ * was punted to another worker already executing it.
+ */
+static bool assign_work(struct work_struct *work, struct worker *worker,
+ struct work_struct **nextp)
+{
+ struct worker_pool *pool = worker->pool;
+ struct worker *collision;
+
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * A single work shouldn't be executed concurrently by multiple workers.
+ * __queue_work() ensures that @work doesn't jump to a different pool
+ * while still running in the previous pool. Here, we should ensure that
+ * @work is not executed concurrently by multiple workers from the same
+ * pool. Check whether anyone is already processing the work. If so,
+ * defer the work to the currently executing one.
+ */
+ collision = find_worker_executing_work(pool, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, nextp);
+ return false;
+ }
+
+ move_linked_works(work, &worker->scheduled, nextp);
+ return true;
+}
+
+/**
+ * kick_pool - wake up an idle worker if necessary
+ * @pool: pool to kick
+ *
+ * @pool may have pending work items. Wake up worker if necessary. Returns
+ * whether a worker was woken up.
+ */
+static bool kick_pool(struct worker_pool *pool)
+{
+ struct worker *worker = first_idle_worker(pool);
+ struct task_struct *p;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!need_more_worker(pool) || !worker)
+ return false;
+
+ p = worker->task;
+
+#ifdef CONFIG_SMP
+ /*
+ * Idle @worker is about to execute @work and waking up provides an
+ * opportunity to migrate @worker at a lower cost by setting the task's
+ * wake_cpu field. Let's see if we want to move @worker to improve
+ * execution locality.
+ *
+ * We're waking the worker that went idle the latest and there's some
+ * chance that @worker is marked idle but hasn't gone off CPU yet. If
+ * so, setting the wake_cpu won't do anything. As this is a best-effort
+ * optimization and the race window is narrow, let's leave as-is for
+ * now. If this becomes pronounced, we can skip over workers which are
+ * still on cpu when picking an idle worker.
+ *
+ * If @pool has non-strict affinity, @worker might have ended up outside
+ * its affinity scope. Repatriate.
+ */
+ if (!pool->attrs->affn_strict &&
+ !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
+ struct work_struct *work = list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+ p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ }
+#endif
+ wake_up_process(p);
+ return true;
+}
+
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+
+/*
+ * Concurrency-managed per-cpu work items that hog CPU for longer than
+ * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
+ * which prevents them from stalling other concurrency-managed work items. If a
+ * work function keeps triggering this mechanism, it's likely that the work item
+ * should be using an unbound workqueue instead.
+ *
+ * wq_cpu_intensive_report() tracks work functions which trigger such conditions
+ * and report them so that they can be examined and converted to use unbound
+ * workqueues as appropriate. To avoid flooding the console, each violating work
+ * function is tracked and reported with exponential backoff.
+ */
+#define WCI_MAX_ENTS 128
+
+struct wci_ent {
+ work_func_t func;
+ atomic64_t cnt;
+ struct hlist_node hash_node;
+};
+
+static struct wci_ent wci_ents[WCI_MAX_ENTS];
+static int wci_nr_ents;
+static DEFINE_RAW_SPINLOCK(wci_lock);
+static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
+
+static struct wci_ent *wci_find_ent(work_func_t func)
+{
+ struct wci_ent *ent;
+
+ hash_for_each_possible_rcu(wci_hash, ent, hash_node,
+ (unsigned long)func) {
+ if (ent->func == func)
+ return ent;
+ }
+ return NULL;
+}
+
+static void wq_cpu_intensive_report(work_func_t func)
+{
+ struct wci_ent *ent;
+
+restart:
+ ent = wci_find_ent(func);
+ if (ent) {
+ u64 cnt;
+
+ /*
+ * Start reporting from the fourth time and back off
+ * exponentially.
+ */
+ cnt = atomic64_inc_return_relaxed(&ent->cnt);
+ if (cnt >= 4 && is_power_of_2(cnt))
+ printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
+ ent->func, wq_cpu_intensive_thresh_us,
+ atomic64_read(&ent->cnt));
+ return;
+ }
+
+ /*
+ * @func is a new violation. Allocate a new entry for it. If wcn_ents[]
+ * is exhausted, something went really wrong and we probably made enough
+ * noise already.
+ */
+ if (wci_nr_ents >= WCI_MAX_ENTS)
+ return;
+
+ raw_spin_lock(&wci_lock);
+
+ if (wci_nr_ents >= WCI_MAX_ENTS) {
+ raw_spin_unlock(&wci_lock);
+ return;
+ }
+
+ if (wci_find_ent(func)) {
+ raw_spin_unlock(&wci_lock);
+ goto restart;
+ }
+
+ ent = &wci_ents[wci_nr_ents++];
+ ent->func = func;
+ atomic64_set(&ent->cnt, 1);
+ hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
+
+ raw_spin_unlock(&wci_lock);
+}
+
+#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+static void wq_cpu_intensive_report(work_func_t func) {}
+#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+
+/**
+ * wq_worker_running - a worker is running again
+ * @task: task waking up
+ *
+ * This function is called when a worker returns from schedule()
+ */
+void wq_worker_running(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ if (!READ_ONCE(worker->sleeping))
+ return;
+
+ /*
+ * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+ * and the nr_running increment below, we may ruin the nr_running reset
+ * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+ * pool. Protect against such race.
+ */
+ preempt_disable();
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ worker->pool->nr_running++;
+ preempt_enable();
+
+ /*
+ * CPU intensive auto-detection cares about how long a work item hogged
+ * CPU without sleeping. Reset the starting timestamp on wakeup.
+ */
+ worker->current_at = worker->task->se.sum_exec_runtime;
+
+ WRITE_ONCE(worker->sleeping, 0);
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ *
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
+ */
+void wq_worker_sleeping(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+ struct worker_pool *pool;
+
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+ * checking NOT_RUNNING.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING)
+ return;
+
+ pool = worker->pool;
+
+ /* Return if preempted before wq_worker_running() was reached */
+ if (READ_ONCE(worker->sleeping))
+ return;
+
+ WRITE_ONCE(worker->sleeping, 1);
+ raw_spin_lock_irq(&pool->lock);
+
+ /*
+ * Recheck in case unbind_workers() preempted us. We don't
+ * want to decrement nr_running after the worker is unbound
+ * and nr_running has been reset.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING) {
+ raw_spin_unlock_irq(&pool->lock);
+ return;
+ }
+
+ pool->nr_running--;
+ if (kick_pool(pool))
+ worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+
+ raw_spin_unlock_irq(&pool->lock);
+}
+
+/**
+ * wq_worker_tick - a scheduler tick occurred while a kworker is running
+ * @task: task currently running
+ *
+ * Called from scheduler_tick(). We're in the IRQ context and the current
+ * worker's fields which follow the 'K' locking rule can be accessed safely.
+ */
+void wq_worker_tick(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+ struct pool_workqueue *pwq = worker->current_pwq;
+ struct worker_pool *pool = worker->pool;
+
+ if (!pwq)
+ return;
+
+ pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
+
+ if (!wq_cpu_intensive_thresh_us)
+ return;
+
+ /*
+ * If the current worker is concurrency managed and hogged the CPU for
+ * longer than wq_cpu_intensive_thresh_us, it's automatically marked
+ * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
+ *
+ * Set @worker->sleeping means that @worker is in the process of
+ * switching out voluntarily and won't be contributing to
+ * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
+ * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
+ * double decrements. The task is releasing the CPU anyway. Let's skip.
+ * We probably want to make this prettier in the future.
+ */
+ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
+ worker->task->se.sum_exec_runtime - worker->current_at <
+ wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
+ return;
+
+ raw_spin_lock(&pool->lock);
+
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+ wq_cpu_intensive_report(worker->current_func);
+ pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
+
+ if (kick_pool(pool))
+ pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+
+ raw_spin_unlock(&pool->lock);
+}
+
+/**
+ * wq_worker_last_func - retrieve worker's last work function
+ * @task: Task to retrieve last work function of.
+ *
+ * Determine the last function a worker executed. This is called from
+ * the scheduler to get a worker's last known identity.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(rq->lock)
+ *
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
+ * Return:
+ * The last work function %current executed as a worker, NULL if it
+ * hasn't executed any work yet.
+ */
+work_func_t wq_worker_last_func(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ return worker->last_func;
+}
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1103,17 +1427,11 @@ static void put_pwq(struct pool_workqueue *pwq)
lockdep_assert_held(&pwq->pool->lock);
if (likely(--pwq->refcnt))
return;
- if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
- return;
/*
- * @pwq can't be released under pool->lock, bounce to
- * pwq_unbound_release_workfn(). This never recurses on the same
- * pool->lock as this path is taken only for unbound workqueues and
- * the release work item is scheduled on a per-cpu workqueue. To
- * avoid lockdep warning, unbound pool->locks are given lockdep
- * subclass of 1 in get_unbound_pool().
+ * @pwq can't be released under pool->lock, bounce to a dedicated
+ * kthread_worker to avoid A-A deadlocks.
*/
- schedule_work(&pwq->unbound_release_work);
+ kthread_queue_work(pwq_release_worker, &pwq->release_work);
}
/**
@@ -1135,7 +1453,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static void pwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_inactive_work(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
@@ -1143,22 +1461,22 @@ static void pwq_activate_delayed_work(struct work_struct *work)
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
pwq->nr_active++;
}
-static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
+static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&pwq->delayed_works,
+ struct work_struct *work = list_first_entry(&pwq->inactive_works,
struct work_struct, entry);
- pwq_activate_delayed_work(work);
+ pwq_activate_inactive_work(work);
}
/**
* pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
* @pwq: pwq of interest
- * @color: color of work which left the queue
+ * @work_data: work_data of work which left the queue
*
* A work either has completed or is removed from pending queue,
* decrement nr_in_flight of its pwq and handle workqueue flushing.
@@ -1166,21 +1484,21 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
- /* uncolored work items don't participate in flushing or nr_active */
- if (color == WORK_NO_COLOR)
- goto out_put;
-
- pwq->nr_in_flight[color]--;
+ int color = get_work_color(work_data);
- pwq->nr_active--;
- if (!list_empty(&pwq->delayed_works)) {
- /* one down, submit a delayed one */
- if (pwq->nr_active < pwq->max_active)
- pwq_activate_first_delayed(pwq);
+ if (!(work_data & WORK_STRUCT_INACTIVE)) {
+ pwq->nr_active--;
+ if (!list_empty(&pwq->inactive_works)) {
+ /* one down, submit an inactive one */
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_inactive(pwq);
+ }
}
+ pwq->nr_in_flight[color]--;
+
/* is flush in progress and are we at the flushing tip? */
if (likely(pwq->flush_color != color))
goto out_put;
@@ -1212,11 +1530,14 @@ out_put:
* stable state - idle, on timer or on worklist.
*
* Return:
+ *
+ * ======== ================================================================
* 1 if @work was pending and we successfully stole PENDING
* 0 if @work was idle and we claimed PENDING
* -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
* -ENOENT if someone else is canceling @work, this state may persist
* for arbitrarily long
+ * ======== ================================================================
*
* Note:
* On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
@@ -1277,17 +1598,21 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
debug_work_deactivate(work);
/*
- * A delayed work item cannot be grabbed directly because
- * it might have linked NO_COLOR work items which, if left
- * on the delayed_list, will confuse pwq->nr_active
+ * A cancelable inactive work item must be in the
+ * pwq->inactive_works since a queued barrier can't be
+ * canceled (see the comments in insert_wq_barrier()).
+ *
+ * An inactive work item cannot be grabbed directly because
+ * it might have linked barrier work items which, if left
+ * on the inactive_works list, will confuse pwq->nr_active
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
- pwq_activate_delayed_work(work);
+ if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
+ pwq_activate_inactive_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(pwq, get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1322,22 +1647,15 @@ fail:
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
- struct worker_pool *pool = pwq->pool;
+ debug_work_activate(work);
+
+ /* record the work call stack in order to print it in KASAN reports */
+ kasan_record_aux_stack_noalloc(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
-
- /*
- * Ensure either wq_worker_sleeping() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers lying
- * around lazily while there are works to be processed.
- */
- smp_mb();
-
- if (__need_more_worker(pool))
- wake_up_worker(pool);
}
/*
@@ -1363,20 +1681,15 @@ static bool is_chained_work(struct workqueue_struct *wq)
*/
static int wq_select_unbound_cpu(int cpu)
{
- static bool printed_dbg_warning;
int new_cpu;
if (likely(!wq_debug_force_rr_cpu)) {
if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
return cpu;
- } else if (!printed_dbg_warning) {
- pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
- printed_dbg_warning = true;
+ } else {
+ pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
}
- if (cpumask_empty(wq_unbound_cpumask))
- return cpu;
-
new_cpu = __this_cpu_read(wq_rr_cpu_last);
new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -1393,8 +1706,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
- struct worker_pool *last_pool;
- struct list_head *worklist;
+ struct worker_pool *last_pool, *pool;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1406,32 +1718,35 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
lockdep_assert_irqs_disabled();
- debug_work_activate(work);
- /* if draining, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & __WQ_DRAINING) &&
- WARN_ON_ONCE(!is_chained_work(wq)))
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
return;
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
- if (wq->flags & WQ_UNBOUND) {
- if (req_cpu == WORK_CPU_UNBOUND)
+ if (req_cpu == WORK_CPU_UNBOUND) {
+ if (wq->flags & WQ_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- } else {
- if (req_cpu == WORK_CPU_UNBOUND)
+ else
cpu = raw_smp_processor_id();
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
}
+ pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ pool = pwq->pool;
+
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pwq->pool) {
+ if (last_pool && last_pool != pool) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -1440,26 +1755,27 @@ retry:
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
+ pool = pwq->pool;
+ WARN_ON_ONCE(pool != last_pool);
} else {
/* meh... not running there, queue here */
raw_spin_unlock(&last_pool->lock);
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
} else {
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
/*
- * pwq is determined and locked. For unbound pools, we could have
- * raced with pwq release and it could already be dead. If its
- * refcnt is zero, repeat pwq selection. Note that pwqs never die
- * without another pwq replacing it in the numa_pwq_tbl or while
- * work items are executing on it, so the retrying is guaranteed to
- * make forward-progress.
+ * pwq is determined and locked. For unbound pools, we could have raced
+ * with pwq release and it could already be dead. If its refcnt is zero,
+ * repeat pwq selection. Note that unbound pwqs never die without
+ * another pwq replacing it in cpu_pwq or while work items are executing
+ * on it, so the retrying is guaranteed to make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
cpu_relax();
goto retry;
}
@@ -1478,20 +1794,20 @@ retry:
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
+ if (list_empty(&pool->worklist))
+ pool->watchdog_ts = jiffies;
+
trace_workqueue_activate_work(work);
pwq->nr_active++;
- worklist = &pwq->pool->worklist;
- if (list_empty(worklist))
- pwq->pool->watchdog_ts = jiffies;
+ insert_work(pwq, work, &pool->worklist, work_flags);
+ kick_pool(pool);
} else {
- work_flags |= WORK_STRUCT_DELAYED;
- worklist = &pwq->delayed_works;
+ work_flags |= WORK_STRUCT_INACTIVE;
+ insert_work(pwq, work, &pwq->inactive_works, work_flags);
}
- insert_work(pwq, work, worklist, work_flags);
-
out:
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
}
@@ -1502,7 +1818,10 @@ out:
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * can't go away. Callers that fail to ensure that the specified
+ * CPU cannot go away will execute on a randomly chosen CPU.
+ * But note well that callers specifying a CPU that never has been
+ * online will get a splat.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
@@ -1525,7 +1844,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
EXPORT_SYMBOL(queue_work_on);
/**
- * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * select_numa_node_cpu - Select a CPU based on NUMA node
* @node: NUMA node ID that we want to select a CPU from
*
* This function will attempt to find a "random" cpu available on a given
@@ -1533,14 +1852,10 @@ EXPORT_SYMBOL(queue_work_on);
* WORK_CPU_UNBOUND indicating that we should just schedule to any
* available CPU if we need to schedule this work.
*/
-static int workqueue_select_cpu_near(int node)
+static int select_numa_node_cpu(int node)
{
int cpu;
- /* No point in doing this if NUMA isn't enabled for workqueues */
- if (!wq_numa_enabled)
- return WORK_CPU_UNBOUND;
-
/* Delay binding to CPU if node is not valid or online */
if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
return WORK_CPU_UNBOUND;
@@ -1597,7 +1912,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- int cpu = workqueue_select_cpu_near(node);
+ int cpu = select_numa_node_cpu(node);
__queue_work(cpu, wq, work);
ret = true;
@@ -1744,7 +2059,7 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
rwork->wq = wq;
- call_rcu(&rwork->rcu, rcu_work_rcufn);
+ call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
return true;
}
@@ -1752,67 +2067,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
}
EXPORT_SYMBOL(queue_rcu_work);
-/**
- * worker_enter_idle - enter idle state
- * @worker: worker which is entering idle state
- *
- * @worker is entering idle state. Update stats and idle timer if
- * necessary.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_enter_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
- WARN_ON_ONCE(!list_empty(&worker->entry) &&
- (worker->hentry.next || worker->hentry.pprev)))
- return;
-
- /* can't use worker_set_flags(), also called from create_worker() */
- worker->flags |= WORKER_IDLE;
- pool->nr_idle++;
- worker->last_active = jiffies;
-
- /* idle_list is LIFO */
- list_add(&worker->entry, &pool->idle_list);
-
- if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
- mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
-
- /*
- * Sanity check nr_running. Because unbind_workers() releases
- * pool->lock between setting %WORKER_UNBOUND and zapping
- * nr_running, the warning may trigger spuriously. Check iff
- * unbind is not in progress.
- */
- WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
- pool->nr_workers == pool->nr_idle &&
- atomic_read(&pool->nr_running));
-}
-
-/**
- * worker_leave_idle - leave idle state
- * @worker: worker which is leaving idle state
- *
- * @worker is leaving idle state. Update stats.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_leave_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
- return;
- worker_clr_flags(worker, WORKER_IDLE);
- pool->nr_idle--;
- list_del_init(&worker->entry);
-}
-
static struct worker *alloc_worker(int node)
{
struct worker *worker;
@@ -1828,6 +2082,14 @@ static struct worker *alloc_worker(int node)
return worker;
}
+static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
+{
+ if (pool->cpu < 0 && pool->attrs->affn_strict)
+ return pool->attrs->__pod_cpumask;
+ else
+ return pool->attrs->cpumask;
+}
+
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
@@ -1843,18 +2105,17 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_lock(&wq_pool_attach_mutex);
/*
- * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
- * online CPUs. It'll be re-applied when any of the CPUs come up.
- */
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
-
- /*
* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
* stable across this function. See the comments above the flag
* definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
+ else
+ kthread_set_per_cpu(worker->task, pool->cpu);
+
+ if (worker->rescue_wq)
+ set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
list_add_tail(&worker->node, &pool->workers);
worker->pool = pool;
@@ -1877,10 +2138,11 @@ static void worker_detach_from_pool(struct worker *worker)
mutex_lock(&wq_pool_attach_mutex);
+ kthread_set_per_cpu(worker->task, -1);
list_del(&worker->node);
worker->pool = NULL;
- if (list_empty(&pool->workers))
+ if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -1905,18 +2167,23 @@ static void worker_detach_from_pool(struct worker *worker)
*/
static struct worker *create_worker(struct worker_pool *pool)
{
- struct worker *worker = NULL;
- int id = -1;
- char id_buf[16];
+ struct worker *worker;
+ int id;
+ char id_buf[23];
/* ID is needed to determine kthread name */
- id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
- if (id < 0)
- goto fail;
+ id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
+ if (id < 0) {
+ pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
+ ERR_PTR(id));
+ return NULL;
+ }
worker = alloc_worker(pool->node);
- if (!worker)
+ if (!worker) {
+ pr_err_once("workqueue: Failed to allocate a worker\n");
goto fail;
+ }
worker->id = id;
@@ -1928,46 +2195,96 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
- if (IS_ERR(worker->task))
+ if (IS_ERR(worker->task)) {
+ if (PTR_ERR(worker->task) == -EINTR) {
+ pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
+ id_buf);
+ } else {
+ pr_err_once("workqueue: Failed to create a worker thread: %pe",
+ worker->task);
+ }
goto fail;
+ }
set_user_nice(worker->task, pool->attrs->nice);
- kthread_bind_mask(worker->task, pool->attrs->cpumask);
+ kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
raw_spin_lock_irq(&pool->lock);
+
worker->pool->nr_workers++;
worker_enter_idle(worker);
+ kick_pool(pool);
+
+ /*
+ * @worker is waiting on a completion in kthread() and will trigger hung
+ * check if not woken up soon. As kick_pool() might not have waken it
+ * up, wake it up explicitly once more.
+ */
wake_up_process(worker->task);
+
raw_spin_unlock_irq(&pool->lock);
return worker;
fail:
- if (id >= 0)
- ida_simple_remove(&pool->worker_ida, id);
+ ida_free(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void wake_dying_workers(struct list_head *cull_list)
+{
+ struct worker *worker, *tmp;
+
+ list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+ list_del_init(&worker->entry);
+ unbind_worker(worker);
+ /*
+ * If the worker was somehow already running, then it had to be
+ * in pool->idle_list when set_worker_dying() happened or we
+ * wouldn't have gotten here.
+ *
+ * Thus, the worker must either have observed the WORKER_DIE
+ * flag, or have set its state to TASK_IDLE. Either way, the
+ * below will be observed by the worker and is safe to do
+ * outside of pool->lock.
+ */
+ wake_up_process(worker->task);
+ }
+}
+
/**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
* @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
*
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->lock);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
@@ -1978,34 +2295,93 @@ static void destroy_worker(struct worker *worker)
pool->nr_workers--;
pool->nr_idle--;
- list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- wake_up_process(worker->task);
+
+ list_move(&worker->entry, list);
+ list_move(&worker->node, &pool->dying_workers);
}
+/**
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ * @t: The pool's idle_timer that just expired
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the too_many_workers() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ bool do_cull = false;
+
+ if (work_pending(&pool->idle_cull_work))
+ return;
raw_spin_lock_irq(&pool->lock);
- while (too_many_workers(pool)) {
+ if (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+ do_cull = !time_before(jiffies, expires);
+
+ if (!do_cull)
+ mod_timer(&pool->idle_timer, expires);
+ }
+ raw_spin_unlock_irq(&pool->lock);
+
+ if (do_cull)
+ queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/**
+ * idle_cull_fn - cull workers that have been idle for too long.
+ * @work: the pool's work for handling these idle workers
+ *
+ * This goes through a pool's idle workers and gets rid of those that have been
+ * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+ struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+ LIST_HEAD(cull_list);
+
+ /*
+ * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+ * cannot proceed beyong worker_detach_from_pool() in its self-destruct
+ * path. This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before wake_dying_workers() did.
+ */
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+
+ while (too_many_workers(pool)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
}
raw_spin_unlock_irq(&pool->lock);
+ wake_dying_workers(&cull_list);
+ mutex_unlock(&wq_pool_attach_mutex);
}
static void send_mayday(struct work_struct *work)
@@ -2028,6 +2404,7 @@ static void send_mayday(struct work_struct *work)
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
+ pwq->stats[PWQ_STAT_MAYDAY]++;
}
}
@@ -2165,9 +2542,7 @@ __acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
- bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
- int work_color;
- struct worker *collision;
+ unsigned long work_data;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
@@ -2184,25 +2559,15 @@ __acquires(&pool->lock)
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
- /*
- * A single work shouldn't be executed concurrently by
- * multiple workers on a single cpu. Check whether anyone is
- * already processing the work. If so, defer the work to the
- * currently executing one.
- */
- collision = find_worker_executing_work(pool, work);
- if (unlikely(collision)) {
- move_linked_works(work, &collision->scheduled, NULL);
- return;
- }
-
/* claim and dequeue */
debug_work_deactivate(work);
hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
- work_color = get_work_color(work);
+ worker->current_at = worker->task->se.sum_exec_runtime;
+ work_data = *work_data_bits(work);
+ worker->current_color = get_work_color(work_data);
/*
* Record wq name for cmdline and debug reporting, may get
@@ -2218,18 +2583,16 @@ __acquires(&pool->lock)
* of concurrency management and the next code block will chain
* execution of the pending work items.
*/
- if (unlikely(cpu_intensive))
+ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Wake up another worker if necessary. The condition is always
- * false for normal per-cpu workers since nr_running would always
- * be >= 1 at this point. This is used to chain execution of the
- * pending work items for WORKER_NOT_RUNNING workers such as the
- * UNBOUND and CPU_INTENSIVE ones.
+ * Kick @pool if necessary. It's always noop for per-cpu worker pools
+ * since nr_running would always be >= 1 at this point. This is used to
+ * chain execution of the pending work items for WORKER_NOT_RUNNING
+ * workers such as the UNBOUND and CPU_INTENSIVE ones.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
/*
* Record the last pool and clear PENDING which should be the last
@@ -2239,6 +2602,7 @@ __acquires(&pool->lock)
*/
set_work_pool_and_clear_pending(work, pool->id);
+ pwq->stats[PWQ_STAT_STARTED]++;
raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
@@ -2272,6 +2636,7 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
@@ -2296,9 +2661,12 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
- /* clear cpu intensive status */
- if (unlikely(cpu_intensive))
- worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+ /*
+ * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
+ * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
+ * wq_cpu_intensive_thresh_us. Clear it.
+ */
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* tag the worker for identification in schedule() */
worker->last_func = worker->current_func;
@@ -2308,7 +2676,8 @@ __acquires(&pool->lock)
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
- pwq_dec_nr_in_flight(pwq, work_color);
+ worker->current_color = INT_MAX;
+ pwq_dec_nr_in_flight(pwq, work_data);
}
/**
@@ -2325,9 +2694,15 @@ __acquires(&pool->lock)
*/
static void process_scheduled_works(struct worker *worker)
{
- while (!list_empty(&worker->scheduled)) {
- struct work_struct *work = list_first_entry(&worker->scheduled,
- struct work_struct, entry);
+ struct work_struct *work;
+ bool first = true;
+
+ while ((work = list_first_entry_or_null(&worker->scheduled,
+ struct work_struct, entry))) {
+ if (first) {
+ worker->pool->watchdog_ts = jiffies;
+ first = false;
+ }
process_one_work(worker, work);
}
}
@@ -2367,12 +2742,12 @@ woke_up:
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
- WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
- ida_simple_remove(&pool->worker_ida, worker->id);
+ ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
kfree(worker);
return 0;
}
@@ -2408,17 +2783,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
- pool->watchdog_ts = jiffies;
-
- if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
- /* optimization path, not strictly necessary */
- process_one_work(worker, work);
- if (unlikely(!list_empty(&worker->scheduled)))
- process_scheduled_works(worker);
- } else {
- move_linked_works(work, &worker->scheduled, NULL);
+ if (assign_work(work, worker, NULL))
process_scheduled_works(worker);
- }
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
@@ -2462,7 +2828,6 @@ static int rescuer_thread(void *__rescuer)
{
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
- struct list_head *scheduled = &rescuer->scheduled;
bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2493,7 +2858,6 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
- bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2508,23 +2872,20 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(scheduled));
+ WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq) {
- if (first)
- pool->watchdog_ts = jiffies;
- move_linked_works(work, scheduled, &n);
- }
- first = false;
+ if (get_work_pwq(work) == pwq &&
+ assign_work(work, rescuer, &n))
+ pwq->stats[PWQ_STAT_RESCUED]++;
}
- if (!list_empty(scheduled)) {
+ if (!list_empty(&rescuer->scheduled)) {
process_scheduled_works(rescuer);
/*
* The above execution of rescued work items could
* have created more to rescue through
- * pwq_activate_first_delayed() or chained
+ * pwq_activate_first_inactive() or chained
* queueing. Let's put @pwq back on mayday list so
* that such back-to-back work items, which may be
* being used to relieve memory pressure, don't
@@ -2551,12 +2912,10 @@ repeat:
put_pwq(pwq);
/*
- * Leave this pool. If need_more_worker() is %true, notify a
- * regular worker; otherwise, we end up with 0 concurrency
- * and stalling the execution.
+ * Leave this pool. Notify regular workers; otherwise, we end up
+ * with 0 concurrency and stalling the execution.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
@@ -2651,8 +3010,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
+ unsigned int work_flags = 0;
+ unsigned int work_color;
struct list_head *head;
- unsigned int linked = 0;
/*
* debugobject calls are safe here even with pool->lock locked
@@ -2667,24 +3027,30 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
+ /* The barrier work item does not participate in pwq->nr_active. */
+ work_flags |= WORK_STRUCT_INACTIVE;
+
/*
* If @target is currently being executed, schedule the
* barrier to the worker; otherwise, put it after @target.
*/
- if (worker)
+ if (worker) {
head = worker->scheduled.next;
- else {
+ work_color = worker->current_color;
+ } else {
unsigned long *bits = work_data_bits(target);
head = target->entry.next;
/* there can already be other linked works, inherit and set */
- linked = *bits & WORK_STRUCT_LINKED;
+ work_flags |= *bits & WORK_STRUCT_LINKED;
+ work_color = get_work_color(*bits);
__set_bit(WORK_STRUCT_LINKED_BIT, bits);
}
- debug_work_activate(&barr->work);
- insert_work(pwq, &barr->work, head,
- work_color_to_flags(WORK_NO_COLOR) | linked);
+ pwq->nr_in_flight[work_color]++;
+ work_flags |= work_color_to_flags(work_color);
+
+ insert_work(pwq, &barr->work, head, work_flags);
}
/**
@@ -2759,13 +3125,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
}
/**
- * flush_workqueue - ensure that any scheduled work has run to completion.
+ * __flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
*
* This function sleeps until all work items which were queued on entry
* have finished execution, but it is not livelocked by new incoming ones.
*/
-void flush_workqueue(struct workqueue_struct *wq)
+void __flush_workqueue(struct workqueue_struct *wq)
{
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
@@ -2914,7 +3280,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL(flush_workqueue);
+EXPORT_SYMBOL(__flush_workqueue);
/**
* drain_workqueue - drain a workqueue
@@ -2942,7 +3308,7 @@ void drain_workqueue(struct workqueue_struct *wq)
wq->flags |= __WQ_DRAINING;
mutex_unlock(&wq->mutex);
reflush:
- flush_workqueue(wq);
+ __flush_workqueue(wq);
mutex_lock(&wq->mutex);
@@ -2950,7 +3316,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -2958,8 +3324,8 @@ reflush:
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
- wq->name, flush_cnt);
+ pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
+ wq->name, __func__, flush_cnt);
mutex_unlock(&wq->mutex);
goto reflush;
@@ -3037,10 +3403,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!work->func))
return false;
- if (!from_cancel) {
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
- }
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);
@@ -3229,6 +3593,15 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
return ret;
}
+/*
+ * See cancel_delayed_work()
+ */
+bool cancel_work(struct work_struct *work)
+{
+ return __cancel_work(work, false);
+}
+EXPORT_SYMBOL(cancel_work);
+
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
@@ -3286,7 +3659,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -3298,7 +3671,7 @@ int schedule_on_each_cpu(work_func_t func)
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(works, cpu));
- put_online_cpus();
+ cpus_read_unlock();
free_percpu(works);
return 0;
}
@@ -3339,6 +3712,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
+ free_cpumask_var(attrs->__pod_cpumask);
kfree(attrs);
}
}
@@ -3360,8 +3734,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
goto fail;
if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
+ if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
+ goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ attrs->affn_scope = WQ_AFFN_DFL;
return attrs;
fail:
free_workqueue_attrs(attrs);
@@ -3373,12 +3750,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
+ to->affn_strict = from->affn_strict;
+
/*
- * Unlike hash and equality test, this function doesn't ignore
- * ->no_numa as it is used for both pool and wq attrs. Instead,
- * get_unbound_pool() explicitly clears ->no_numa after copying.
+ * Unlike hash and equality test, copying shouldn't ignore wq-only
+ * fields as copying is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears the fields.
*/
- to->no_numa = from->no_numa;
+ to->affn_scope = from->affn_scope;
+ to->ordered = from->ordered;
+}
+
+/*
+ * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
+ * comments in 'struct workqueue_attrs' definition.
+ */
+static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
+{
+ attrs->affn_scope = WQ_AFFN_NR_TYPES;
+ attrs->ordered = false;
}
/* hash value of the content of @attr */
@@ -3389,6 +3780,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
hash = jhash_1word(attrs->nice, hash);
hash = jhash(cpumask_bits(attrs->cpumask),
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash(cpumask_bits(attrs->__pod_cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash_1word(attrs->affn_strict, hash);
return hash;
}
@@ -3400,9 +3794,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
return false;
if (!cpumask_equal(a->cpumask, b->cpumask))
return false;
+ if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
+ return false;
+ if (a->affn_strict != b->affn_strict)
+ return false;
return true;
}
+/* Update @attrs with actually available CPUs */
+static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
+ const cpumask_t *unbound_cpumask)
+{
+ /*
+ * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
+ * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
+ * @unbound_cpumask.
+ */
+ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
+ if (unlikely(cpumask_empty(attrs->cpumask)))
+ cpumask_copy(attrs->cpumask, unbound_cpumask);
+}
+
+/* find wq_pod_type to use for @attrs */
+static const struct wq_pod_type *
+wqattrs_pod_type(const struct workqueue_attrs *attrs)
+{
+ enum wq_affn_scope scope;
+ struct wq_pod_type *pt;
+
+ /* to synchronize access to wq_affn_dfl */
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (attrs->affn_scope == WQ_AFFN_DFL)
+ scope = wq_affn_dfl;
+ else
+ scope = attrs->affn_scope;
+
+ pt = &wq_pod_types[scope];
+
+ if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
+ likely(pt->nr_pods))
+ return pt;
+
+ /*
+ * Before workqueue_init_topology(), only SYSTEM is available which is
+ * initialized in workqueue_init_early().
+ */
+ pt = &wq_pod_types[WQ_AFFN_SYSTEM];
+ BUG_ON(!pt->nr_pods);
+ return pt;
+}
+
/**
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
@@ -3426,10 +3868,12 @@ static int init_worker_pool(struct worker_pool *pool)
hash_init(pool->busy_hash);
timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+ INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
+ INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -3439,6 +3883,9 @@ static int init_worker_pool(struct worker_pool *pool)
pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
+
+ wqattrs_clear_for_pool(pool->attrs);
+
return 0;
}
@@ -3486,12 +3933,8 @@ static void rcu_free_wq(struct rcu_head *rcu)
container_of(rcu, struct workqueue_struct, rcu);
wq_free_lockdep(wq);
-
- if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->cpu_pwqs);
- else
- free_workqueue_attrs(wq->unbound_attrs);
-
+ free_percpu(wq->cpu_pwq);
+ free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
}
@@ -3504,18 +3947,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
kfree(pool);
}
-/* This returns with the lock held on success (pool manager is inactive). */
-static bool wq_manager_inactive(struct worker_pool *pool)
-{
- raw_spin_lock_irq(&pool->lock);
-
- if (pool->flags & POOL_MANAGER_ACTIVE) {
- raw_spin_unlock_irq(&pool->lock);
- return false;
- }
- return true;
-}
-
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
@@ -3531,6 +3962,7 @@ static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
+ LIST_HEAD(cull_list);
lockdep_assert_held(&wq_pool_mutex);
@@ -3551,20 +3983,38 @@ static void put_unbound_pool(struct worker_pool *pool)
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
- * Because of how wq_manager_inactive() works, we will hold the
- * spinlock after a successful wait.
+ *
+ * Having a concurrent manager is quite unlikely to happen as we can
+ * only get here with
+ * pwq->refcnt == pool->refcnt == 0
+ * which implies no work queued to the pool, which implies no worker can
+ * become the manager. However a worker could have taken the role of
+ * manager before the refcnts dropped to 0, since maybe_create_worker()
+ * drops pool->lock
*/
- rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
- TASK_UNINTERRUPTIBLE);
- pool->flags |= POOL_MANAGER_ACTIVE;
+ while (true) {
+ rcuwait_wait_event(&manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE),
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+ if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
+ pool->flags |= POOL_MANAGER_ACTIVE;
+ break;
+ }
+ raw_spin_unlock_irq(&pool->lock);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
while ((worker = first_idle_worker(pool)))
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- mutex_lock(&wq_pool_attach_mutex);
- if (!list_empty(&pool->workers))
+ wake_dying_workers(&cull_list);
+
+ if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -3573,6 +4023,7 @@ static void put_unbound_pool(struct worker_pool *pool)
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
+ cancel_work_sync(&pool->idle_cull_work);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
@@ -3595,10 +4046,10 @@ static void put_unbound_pool(struct worker_pool *pool)
*/
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
- int node;
- int target_node = NUMA_NO_NODE;
+ int pod, node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
@@ -3610,31 +4061,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
}
}
- /* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
- for_each_node(node) {
- if (cpumask_subset(attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
- target_node = node;
- break;
- }
+ /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
+ for (pod = 0; pod < pt->nr_pods; pod++) {
+ if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
+ node = pt->pod_node[pod];
+ break;
}
}
/* nope, create a new one */
- pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
- lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ pool->node = node;
copy_workqueue_attrs(pool->attrs, attrs);
- pool->node = target_node;
-
- /*
- * no_numa isn't a worker_pool attribute, always clear it. See
- * 'struct workqueue_attrs' comments for detail.
- */
- pool->attrs->no_numa = false;
+ wqattrs_clear_for_pool(pool->attrs);
if (worker_pool_assign_id(pool) < 0)
goto fail;
@@ -3660,28 +4102,33 @@ static void rcu_free_pwq(struct rcu_head *rcu)
}
/*
- * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
- * and needs to be destroyed.
+ * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
+ * refcnt and needs to be destroyed.
*/
-static void pwq_unbound_release_workfn(struct work_struct *work)
+static void pwq_release_workfn(struct kthread_work *work)
{
struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
- unbound_release_work);
+ release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
-
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ bool is_last = false;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ /*
+ * When @pwq is not linked, it doesn't hold any reference to the
+ * @wq, and @wq is invalid to access.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
- mutex_lock(&wq_pool_mutex);
- put_unbound_pool(pool);
- mutex_unlock(&wq_pool_mutex);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+ }
call_rcu(&pwq->rcu, rcu_free_pwq);
@@ -3700,7 +4147,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
* @pwq: target pool_workqueue
*
* If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate delayed work items
+ * workqueue's saved_max_active and activate inactive work items
* accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
*/
static void pwq_adjust_max_active(struct pool_workqueue *pwq)
@@ -3727,15 +4174,11 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
if (!freezable || !workqueue_freezing) {
pwq->max_active = wq->saved_max_active;
- while (!list_empty(&pwq->delayed_works) &&
+ while (!list_empty(&pwq->inactive_works) &&
pwq->nr_active < pwq->max_active)
- pwq_activate_first_delayed(pwq);
+ pwq_activate_first_inactive(pwq);
- /*
- * Need to kick a worker after thawed or an unbound wq's
- * max_active is bumped. It's a slow path. Do it always.
- */
- wake_up_worker(pwq->pool);
+ kick_pool(pwq->pool);
} else {
pwq->max_active = 0;
}
@@ -3743,7 +4186,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
-/* initialize newly alloced @pwq which is associated with @wq and @pool */
+/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
@@ -3755,10 +4198,10 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->wq = wq;
pwq->flush_color = -1;
pwq->refcnt = 1;
- INIT_LIST_HEAD(&pwq->delayed_works);
+ INIT_LIST_HEAD(&pwq->inactive_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
- INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+ kthread_init_work(&pwq->release_work, pwq_release_workfn);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -3806,61 +4249,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
}
/**
- * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
- * @node: the target NUMA node
+ * @cpu: the target CPU
* @cpu_going_down: if >= 0, the CPU to consider as offline
- * @cpumask: outarg, the resulting cpumask
*
- * Calculate the cpumask a workqueue with @attrs should use on @node. If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation. The result is stored in @cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on @pod. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
+ * The result is stored in @attrs->__pod_cpumask.
*
- * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
- * enabled and @node has online CPUs requested by @attrs, the returned
- * cpumask is the intersection of the possible CPUs of @node and
- * @attrs->cpumask.
+ * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
+ * and @pod has online CPUs requested by @attrs, the returned cpumask is the
+ * intersection of the possible CPUs of @pod and @attrs->cpumask.
*
- * The caller is responsible for ensuring that the cpumask of @node stays
- * stable.
- *
- * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
- * %false if equal.
+ * The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
- int cpu_going_down, cpumask_t *cpumask)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
+ int cpu_going_down)
{
- if (!wq_numa_enabled || attrs->no_numa)
- goto use_dfl;
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int pod = pt->cpu_pod[cpu];
- /* does @node have any online CPUs @attrs wants? */
- cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ /* does @pod have any online CPUs @attrs wants? */
+ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
+ cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, cpumask);
+ cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
- if (cpumask_empty(cpumask))
- goto use_dfl;
+ if (cpumask_empty(attrs->__pod_cpumask)) {
+ cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
+ return;
+ }
- /* yeap, return possible CPUs in @node that @attrs wants */
- cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ /* yeap, return possible CPUs in @pod that @attrs wants */
+ cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
- if (cpumask_empty(cpumask)) {
+ if (cpumask_empty(attrs->__pod_cpumask))
pr_warn_once("WARNING: workqueue cpumask: online intersect > "
"possible intersect\n");
- return false;
- }
-
- return !cpumask_equal(cpumask, attrs->cpumask);
-
-use_dfl:
- cpumask_copy(cpumask, attrs->cpumask);
- return false;
}
-/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
-static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
- int node,
- struct pool_workqueue *pwq)
+/* install @pwq into @wq's cpu_pwq and return the old pwq */
+static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
+ int cpu, struct pool_workqueue *pwq)
{
struct pool_workqueue *old_pwq;
@@ -3870,8 +4301,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
return old_pwq;
}
@@ -3888,10 +4319,10 @@ struct apply_wqattrs_ctx {
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
if (ctx) {
- int node;
+ int cpu;
- for_each_node(node)
- put_pwq_unlocked(ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ put_pwq_unlocked(ctx->pwq_tbl[cpu]);
put_pwq_unlocked(ctx->dfl_pwq);
free_workqueue_attrs(ctx->attrs);
@@ -3903,78 +4334,68 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+ const struct workqueue_attrs *attrs,
+ const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
- struct workqueue_attrs *new_attrs, *tmp_attrs;
- int node;
+ struct workqueue_attrs *new_attrs;
+ int cpu;
lockdep_assert_held(&wq_pool_mutex);
- ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
+ if (WARN_ON(attrs->affn_scope < 0 ||
+ attrs->affn_scope >= WQ_AFFN_NR_TYPES))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs();
- tmp_attrs = alloc_workqueue_attrs();
- if (!ctx || !new_attrs || !tmp_attrs)
+ if (!ctx || !new_attrs)
goto out_free;
/*
- * Calculate the attrs of the default pwq.
- * If the user configured cpumask doesn't overlap with the
- * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
- */
- copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
- if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
-
- /*
- * We may create multiple pwqs with differing cpumasks. Make a
- * copy of @new_attrs which will be modified and used to obtain
- * pools.
- */
- copy_workqueue_attrs(tmp_attrs, new_attrs);
-
- /*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
+ copy_workqueue_attrs(new_attrs, attrs);
+ wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->dfl_pwq)
goto out_free;
- for_each_node(node) {
- if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
- ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
- if (!ctx->pwq_tbl[node])
- goto out_free;
- } else {
+ for_each_possible_cpu(cpu) {
+ if (new_attrs->ordered) {
ctx->dfl_pwq->refcnt++;
- ctx->pwq_tbl[node] = ctx->dfl_pwq;
+ ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
+ } else {
+ wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
+ if (!ctx->pwq_tbl[cpu])
+ goto out_free;
}
}
/* save the user configured attrs and sanitize it. */
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->attrs = new_attrs;
ctx->wq = wq;
- free_workqueue_attrs(tmp_attrs);
return ctx;
out_free:
- free_workqueue_attrs(tmp_attrs);
free_workqueue_attrs(new_attrs);
apply_wqattrs_cleanup(ctx);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
- int node;
+ int cpu;
/* all pwqs have been created successfully, let's install'em */
mutex_lock(&ctx->wq->mutex);
@@ -3982,9 +4403,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
- for_each_node(node)
- ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
- ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
+ ctx->pwq_tbl[cpu]);
/* @dfl_pwq might not have been used, ensure it's linked */
link_pwq(ctx->dfl_pwq);
@@ -3993,19 +4414,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
mutex_unlock(&ctx->wq->mutex);
}
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- get_online_cpus();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- put_online_cpus();
-}
-
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
@@ -4023,9 +4431,9 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
wq->flags &= ~__WQ_ORDERED;
}
- ctx = apply_wqattrs_prepare(wq, attrs);
- if (!ctx)
- return -ENOMEM;
+ ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
/* the ctx has been prepared successfully, let's commit it */
apply_wqattrs_commit(ctx);
@@ -4039,16 +4447,15 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
* @wq: the target workqueue
* @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
*
- * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on. Older pwqs are released as in-flight work
- * items finish. Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
+ * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that
+ * work items are affine to the pod it was issued on. Older pwqs are released as
+ * in-flight work items finish. Note that a work item which repeatedly requeues
+ * itself back-to-back will stay on its current pwq.
*
* Performs GFP_KERNEL allocations.
*
- * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus().
+ * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
*
* Return: 0 on success and -errno on failure.
*/
@@ -4067,40 +4474,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU coming up or going down
+ * @cpu: the CPU to update pool association for
+ * @hotplug_cpu: the CPU coming up or going down
* @online: whether @cpu is coming up or going down
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
* @wq accordingly.
*
- * If NUMA affinity can't be adjusted due to memory allocation failure, it
- * falls back to @wq->dfl_pwq which may not be optimal but is always
- * correct.
- *
- * Note that when the last allowed CPU of a NUMA node goes offline for a
- * workqueue with a cpumask spanning multiple nodes, the workers which were
- * already executing the work items for the workqueue will lose their CPU
- * affinity and may execute on any CPU. This is similar to how per-cpu
- * workqueues behave on CPU_DOWN. If a workqueue user wants strict
- * affinity, it's the user's responsibility to flush the work item from
- * CPU_DOWN_PREPARE.
+ *
+ * If pod affinity can't be adjusted due to memory allocation failure, it falls
+ * back to @wq->dfl_pwq which may not be optimal but is always correct.
+ *
+ * Note that when the last allowed CPU of a pod goes offline for a workqueue
+ * with a cpumask spanning multiple pods, the workers which were already
+ * executing the work items for the workqueue will lose their CPU affinity and
+ * may execute on any CPU. This is similar to how per-cpu workqueues behave on
+ * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
+ * responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
- bool online)
+static void wq_update_pod(struct workqueue_struct *wq, int cpu,
+ int hotplug_cpu, bool online)
{
- int node = cpu_to_node(cpu);
- int cpu_off = online ? -1 : cpu;
+ int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
- cpumask_t *cpumask;
lockdep_assert_held(&wq_pool_mutex);
- if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
- wq->unbound_attrs->no_numa)
+ if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
return;
/*
@@ -4108,36 +4512,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_unbound_numa_attrs_buf;
- cpumask = target_attrs->cpumask;
+ target_attrs = wq_update_pod_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
- pwq = unbound_pwq_by_node(wq, node);
+ wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
- /*
- * Let's determine what needs to be done. If the target cpumask is
- * different from the default pwq's, we need to compare it to @pwq's
- * and create a new one if they don't match. If the target cpumask
- * equals the default pwq's, the default pwq should be used.
- */
- if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
- if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
- return;
- } else {
- goto use_dfl_pwq;
- }
+ /* nothing to do if the target cpumask matches the current pwq */
+ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
+ lockdep_is_held(&wq_pool_mutex));
+ if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ return;
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
- pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
wq->name);
goto use_dfl_pwq;
}
/* Install the new pwq. */
mutex_lock(&wq->mutex);
- old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
goto out_unlock;
use_dfl_pwq:
@@ -4145,7 +4542,7 @@ use_dfl_pwq:
raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4156,27 +4553,32 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
- if (!(wq->flags & WQ_UNBOUND)) {
- wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
- if (!wq->cpu_pwqs)
- return -ENOMEM;
+ wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
+ if (!wq->cpu_pwq)
+ goto enomem;
+ if (!(wq->flags & WQ_UNBOUND)) {
for_each_possible_cpu(cpu) {
- struct pool_workqueue *pwq =
- per_cpu_ptr(wq->cpu_pwqs, cpu);
- struct worker_pool *cpu_pools =
- per_cpu(cpu_worker_pools, cpu);
+ struct pool_workqueue **pwq_p =
+ per_cpu_ptr(wq->cpu_pwq, cpu);
+ struct worker_pool *pool =
+ &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);
- init_pwq(pwq, wq, &cpu_pools[highpri]);
+ *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
+ pool->node);
+ if (!*pwq_p)
+ goto enomem;
+
+ init_pwq(*pwq_p, wq, pool);
mutex_lock(&wq->mutex);
- link_pwq(pwq);
+ link_pwq(*pwq_p);
mutex_unlock(&wq->mutex);
}
return 0;
}
- get_online_cpus();
+ cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
@@ -4186,21 +4588,38 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
- put_online_cpus();
+ cpus_read_unlock();
+
+ /* for unbound pwq, flush the pwq_release_worker ensures that the
+ * pwq_release_workfn() completes before calling kfree(wq).
+ */
+ if (ret)
+ kthread_flush_worker(pwq_release_worker);
return ret;
+
+enomem:
+ if (wq->cpu_pwq) {
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+
+ if (pwq)
+ kmem_cache_free(pwq_cache, pwq);
+ }
+ free_percpu(wq->cpu_pwq);
+ wq->cpu_pwq = NULL;
+ }
+ return -ENOMEM;
}
static int wq_clamp_max_active(int max_active, unsigned int flags,
const char *name)
{
- int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
-
- if (max_active < 1 || max_active > lim)
+ if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
- max_active, name, 1, lim);
+ max_active, name, 1, WQ_MAX_ACTIVE);
- return clamp_val(max_active, 1, lim);
+ return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}
/*
@@ -4216,13 +4635,18 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
rescuer = alloc_worker(NUMA_NO_NODE);
- if (!rescuer)
+ if (!rescuer) {
+ pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
+ wq->name);
return -ENOMEM;
+ }
rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);
if (IS_ERR(rescuer->task)) {
ret = PTR_ERR(rescuer->task);
+ pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
+ wq->name, ERR_PTR(ret));
kfree(rescuer);
return ret;
}
@@ -4239,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
int max_active, ...)
{
- size_t tbl_size = 0;
va_list args;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
/*
- * Unbound && max_active == 1 used to imply ordered, which is no
- * longer the case on NUMA machines due to per-node pools. While
+ * Unbound && max_active == 1 used to imply ordered, which is no longer
+ * the case on many machines due to per-pod pools. While
* alloc_ordered_workqueue() is the right way to create an ordered
- * workqueue, keep the previous behavior to avoid subtle breakages
- * on NUMA.
+ * workqueue, keep the previous behavior to avoid subtle breakages.
*/
if ((flags & WQ_UNBOUND) && max_active == 1)
flags |= __WQ_ORDERED;
@@ -4259,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- if (flags & WQ_UNBOUND)
- tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
-
- wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
@@ -4342,7 +4761,7 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
return true;
- if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works))
return true;
return false;
@@ -4357,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq)
void destroy_workqueue(struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
- int node;
+ int cpu;
/*
* Remove it from sysfs first so that sanity check failure doesn't
@@ -4365,6 +4784,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
*/
workqueue_sysfs_unregister(wq);
+ /* mark the workqueue destruction is in progress */
+ mutex_lock(&wq->mutex);
+ wq->flags |= __WQ_DESTROYING;
+ mutex_unlock(&wq->mutex);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
@@ -4397,7 +4821,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
raw_spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
mutex_unlock(&wq_pool_mutex);
- show_workqueue_state();
+ show_one_workqueue(wq);
return;
}
raw_spin_unlock_irq(&pwq->pool->lock);
@@ -4411,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- if (!(wq->flags & WQ_UNBOUND)) {
- wq_unregister_lockdep(wq);
- /*
- * The base ref is never dropped on per-cpu pwqs. Directly
- * schedule RCU free.
- */
- call_rcu(&wq->rcu, rcu_free_wq);
- } else {
- /*
- * We're the sole accessor of @wq at this point. Directly
- * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
- * @wq will be freed when the last pwq is released.
- */
- for_each_node(node) {
- pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
- put_pwq_unlocked(pwq);
- }
+ /*
+ * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
+ * to put the base refs. @wq will be auto-destroyed from the last
+ * pwq_put. RCU read lock prevents @wq from going away from under us.
+ */
+ rcu_read_lock();
- /*
- * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
- * put. Don't access it afterwards.
- */
- pwq = wq->dfl_pwq;
- wq->dfl_pwq = NULL;
+ for_each_possible_cpu(cpu) {
+ pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
put_pwq_unlocked(pwq);
}
+
+ put_pwq_unlocked(wq->dfl_pwq);
+ wq->dfl_pwq = NULL;
+
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -4514,10 +4928,11 @@ bool current_is_workqueue_rescuer(void)
* unreliable and only useful as advisory hints or for debugging.
*
* If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
- * Note that both per-cpu and unbound workqueues may be associated with
- * multiple pool_workqueues which have separate congested states. A
- * workqueue being congested on one CPU doesn't mean the workqueue is also
- * contested on other CPUs / NUMA nodes.
+ *
+ * With the exception of ordered workqueues, all workqueues have per-cpu
+ * pool_workqueues, each with its own congested state. A workqueue being
+ * congested on one CPU doesn't mean that the workqueue is contested on any
+ * other CPUs.
*
* Return:
* %true if congested, %false otherwise.
@@ -4533,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+ pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+ ret = !list_empty(&pwq->inactive_works);
- ret = !list_empty(&pwq->delayed_works);
preempt_enable();
rcu_read_unlock();
@@ -4660,22 +5072,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)
pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}
-static void pr_cont_work(bool comma, struct work_struct *work)
+struct pr_cont_work_struct {
+ bool comma;
+ work_func_t func;
+ long ctr;
+};
+
+static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
+{
+ if (!pcwsp->ctr)
+ goto out_record;
+ if (func == pcwsp->func) {
+ pcwsp->ctr++;
+ return;
+ }
+ if (pcwsp->ctr == 1)
+ pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
+ else
+ pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
+ pcwsp->ctr = 0;
+out_record:
+ if ((long)func == -1L)
+ return;
+ pcwsp->comma = comma;
+ pcwsp->func = func;
+ pcwsp->ctr = 1;
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
if (work->func == wq_barrier_func) {
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
- pr_cont("%s %ps", comma ? "," : "", work->func);
+ if (!comma)
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
+ pr_cont_work_flush(comma, work->func, pcwsp);
}
}
static void show_pwq(struct pool_workqueue *pwq)
{
+ struct pr_cont_work_struct pcws = { .ctr = 0, };
struct worker_pool *pool = pwq->pool;
struct work_struct *work;
struct worker *worker;
@@ -4708,7 +5151,8 @@ static void show_pwq(struct pool_workqueue *pwq)
worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
- pr_cont_work(false, work);
+ pr_cont_work(false, work, &pcws);
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
comma = true;
}
pr_cont("\n");
@@ -4728,100 +5172,161 @@ static void show_pwq(struct pool_workqueue *pwq)
if (get_work_pwq(work) != pwq)
continue;
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
- if (!list_empty(&pwq->delayed_works)) {
+ if (!list_empty(&pwq->inactive_works)) {
bool comma = false;
- pr_info(" delayed:");
- list_for_each_entry(work, &pwq->delayed_works, entry) {
- pr_cont_work(comma, work);
+ pr_info(" inactive:");
+ list_for_each_entry(work, &pwq->inactive_works, entry) {
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
}
/**
- * show_workqueue_state - dump workqueue state
+ * show_one_workqueue - dump state of specified workqueue
+ * @wq: workqueue whose state will be printed
+ */
+void show_one_workqueue(struct workqueue_struct *wq)
+{
+ struct pool_workqueue *pwq;
+ bool idle = true;
+ unsigned long flags;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle) /* Nothing to print for idle workqueue */
+ return;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
+ show_pwq(pwq);
+ printk_deferred_exit();
+ }
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
+ }
+
+}
+
+/**
+ * show_one_worker_pool - dump state of specified worker pool
+ * @pool: worker pool whose state will be printed
+ */
+static void show_one_worker_pool(struct worker_pool *pool)
+{
+ struct worker *worker;
+ bool first = true;
+ unsigned long flags;
+ unsigned long hung = 0;
+
+ raw_spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ /* How long the first pending work is waiting for a worker. */
+ if (!list_empty(&pool->worklist))
+ hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
+
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ printk_deferred_exit();
+next_pool:
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
+
+}
+
+/**
+ * show_all_workqueues - dump workqueue state
*
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * Called from a sysrq handler and prints out all busy workqueues and pools.
*/
-void show_workqueue_state(void)
+void show_all_workqueues(void)
{
struct workqueue_struct *wq;
struct worker_pool *pool;
- unsigned long flags;
int pi;
rcu_read_lock();
pr_info("Showing busy workqueues and worker pools:\n");
- list_for_each_entry_rcu(wq, &workqueues, list) {
- struct pool_workqueue *pwq;
- bool idle = true;
+ list_for_each_entry_rcu(wq, &workqueues, list)
+ show_one_workqueue(wq);
- for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
- idle = false;
- break;
- }
- }
- if (idle)
- continue;
+ for_each_pool(pool, pi)
+ show_one_worker_pool(pool);
- pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+ rcu_read_unlock();
+}
- for_each_pwq(pwq, wq) {
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->delayed_works))
- show_pwq(pwq);
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
- /*
- * We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
- * hard lockup.
- */
- touch_nmi_watchdog();
- }
- }
+/**
+ * show_freezable_workqueues - dump freezable workqueue state
+ *
+ * Called from try_to_freeze_tasks() and prints out all freezable workqueues
+ * still busy.
+ */
+void show_freezable_workqueues(void)
+{
+ struct workqueue_struct *wq;
- for_each_pool(pool, pi) {
- struct worker *worker;
- bool first = true;
+ rcu_read_lock();
- raw_spin_lock_irqsave(&pool->lock, flags);
- if (pool->nr_workers == pool->nr_idle)
- goto next_pool;
-
- pr_info("pool %d:", pool->id);
- pr_cont_pool_info(pool);
- pr_cont(" hung=%us workers=%d",
- jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
- pool->nr_workers);
- if (pool->manager)
- pr_cont(" manager: %d",
- task_pid_nr(pool->manager->task));
- list_for_each_entry(worker, &pool->idle_list, entry) {
- pr_cont(" %s%d", first ? "idle: " : "",
- task_pid_nr(worker->task));
- first = false;
- }
- pr_cont("\n");
- next_pool:
- raw_spin_unlock_irqrestore(&pool->lock, flags);
- /*
- * We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
- * hard lockup.
- */
- touch_nmi_watchdog();
+ pr_info("Showing freezable workqueues that are still busy:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_FREEZABLE))
+ continue;
+ show_one_workqueue(wq);
}
rcu_read_unlock();
@@ -4895,44 +5400,39 @@ static void unbind_workers(int cpu)
/*
* We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
- * except for the ones which are still executing works from
- * before the last CPU down must be on the cpu. After
- * this, they may become diasporas.
+ * must be on the cpu. After this, they may become diasporas.
+ * And the preemption disabled section in their sched callbacks
+ * are guaranteed to see WORKER_UNBOUND since the code here
+ * is on the same cpu.
*/
for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
- raw_spin_unlock_irq(&pool->lock);
- mutex_unlock(&wq_pool_attach_mutex);
-
- /*
- * Call schedule() so that we cross rq->lock and thus can
- * guarantee sched callbacks see the %WORKER_UNBOUND flag.
- * This is necessary as scheduler callbacks may be invoked
- * from other cpus.
- */
- schedule();
-
/*
- * Sched callbacks are disabled now. Zap nr_running.
- * After this, nr_running stays zero and need_more_worker()
- * and keep_working() are always true as long as the
- * worklist is not empty. This pool now behaves as an
- * unbound (in terms of concurrency management) pool which
+ * The handling of nr_running in sched callbacks are disabled
+ * now. Zap nr_running. After this, nr_running stays zero and
+ * need_more_worker() and keep_working() are always true as
+ * long as the worklist is not empty. This pool now behaves as
+ * an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
- atomic_set(&pool->nr_running, 0);
+ pool->nr_running = 0;
/*
* With concurrency management just turned off, a busy
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- raw_spin_lock_irq(&pool->lock);
- wake_up_worker(pool);
+ kick_pool(pool);
+
raw_spin_unlock_irq(&pool->lock);
+
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
+
+ mutex_unlock(&wq_pool_attach_mutex);
}
}
@@ -4955,9 +5455,11 @@ static void rebind_workers(struct worker_pool *pool)
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
- for_each_pool_worker(worker, pool)
+ for_each_pool_worker(worker, pool) {
+ kthread_set_per_cpu(worker->task, pool->cpu);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
+ pool_allowed_cpus(pool)) < 0);
+ }
raw_spin_lock_irq(&pool->lock);
@@ -4967,17 +5469,6 @@ static void rebind_workers(struct worker_pool *pool)
unsigned int worker_flags = worker->flags;
/*
- * A bound idle worker should actually be on the runqueue
- * of the associated CPU for local wake-ups targeting it to
- * work. Kick all idle workers so that they migrate to the
- * associated CPU. Doing this in the same loop as
- * replacing UNBOUND with REBOUND is safe as no worker will
- * be bound before @pool->lock is released.
- */
- if (worker_flags & WORKER_IDLE)
- wake_up_process(worker->task);
-
- /*
* We want to clear UNBOUND but can't directly call
* worker_clr_flags() or adjust nr_running. Atomically
* replace UNBOUND with another NOT_RUNNING flag REBOUND.
@@ -5061,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_unlock(&wq_pool_attach_mutex);
}
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update pod affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, true);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5079,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu)
unbind_workers(cpu);
- /* update NUMA affinity of unbound workqueues */
+ /* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, false);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5103,50 +5612,54 @@ static void work_for_cpu_fn(struct work_struct *work)
}
/**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
*
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
schedule_work_on(cpu, &wfc.work);
flush_work(&wfc.work);
destroy_work_on_stack(&wfc.work);
return wfc.ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
/**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
*
* Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
* any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
long ret = -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
if (cpu_online(cpu))
- ret = work_on_cpu(cpu, fn, arg);
- put_online_cpus();
+ ret = work_on_cpu_key(cpu, fn, arg, key);
+ cpus_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -5155,7 +5668,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe);
* freeze_workqueues_begin - begin freezing workqueues
*
* Start freezing workqueues. After this function returns, all freezable
- * workqueues will queue new works to their delayed_works list instead of
+ * workqueues will queue new works to their inactive_works list instead of
* pool->worklist.
*
* CONTEXT:
@@ -5261,7 +5774,7 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5277,9 +5790,9 @@ static int workqueue_apply_unbound_cpumask(void)
if (wq->flags & __WQ_ORDERED)
continue;
- ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
- if (!ctx) {
- ret = -ENOMEM;
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
break;
}
@@ -5292,70 +5805,118 @@ static int workqueue_apply_unbound_cpumask(void)
apply_wqattrs_cleanup(ctx);
}
+ if (!ret) {
+ mutex_lock(&wq_pool_attach_mutex);
+ cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
return ret;
}
/**
- * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
- * @cpumask: the cpumask to set
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
- * The low-level workqueues cpumask is a global cpumask that limits
- * the affinity of all unbound workqueues. This function check the @cpumask
- * and apply it to all unbound workqueues and updates all pwqs of them.
- *
- * Retun: 0 - Success
- * -EINVAL - Invalid @cpumask
- * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
*/
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
- int ret = -EINVAL;
- cpumask_var_t saved_cpumask;
+ cpumask_var_t cpumask;
+ int ret = 0;
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
+ lockdep_assert_cpus_held();
+ mutex_lock(&wq_pool_mutex);
+
+ /* Save the current isolated cpumask & export it via sysfs */
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
/*
- * Not excluding isolated cpus on purpose.
- * If the user wishes to include them, we allow that.
+ * If the operation fails, it will fall back to
+ * wq_requested_unbound_cpumask which is initially set to
+ * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+ * by any subsequent write to workqueue/cpumask sysfs file.
*/
- cpumask_and(cpumask, cpumask, cpu_possible_mask);
- if (!cpumask_empty(cpumask)) {
- apply_wqattrs_lock();
+ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+ ret = workqueue_apply_unbound_cpumask(cpumask);
- /* save the old wq_unbound_cpumask. */
- cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+ mutex_unlock(&wq_pool_mutex);
+ free_cpumask_var(cpumask);
+ return ret;
+}
- /* update wq_unbound_cpumask at first and apply it to wqs. */
- cpumask_copy(wq_unbound_cpumask, cpumask);
- ret = workqueue_apply_unbound_cpumask();
+static int parse_affn_scope(const char *val)
+{
+ int i;
- /* restore the wq_unbound_cpumask when failed. */
- if (ret < 0)
- cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
+ if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
+ return i;
+ }
+ return -EINVAL;
+}
- apply_wqattrs_unlock();
+static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
+{
+ struct workqueue_struct *wq;
+ int affn, cpu;
+
+ affn = parse_affn_scope(val);
+ if (affn < 0)
+ return affn;
+ if (affn == WQ_AFFN_DFL)
+ return -EINVAL;
+
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+
+ wq_affn_dfl = affn;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ for_each_online_cpu(cpu) {
+ wq_update_pod(wq, cpu, cpu, true);
+ }
}
- free_cpumask_var(saved_cpumask);
- return ret;
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
+
+ return 0;
+}
+
+static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
+{
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
}
+static const struct kernel_param_ops wq_affn_dfl_ops = {
+ .set = wq_affn_dfl_set,
+ .get = wq_affn_dfl_get,
+};
+
+module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
+
#ifdef CONFIG_SYSFS
/*
* Workqueues with WQ_SYSFS flag set is visible to userland via
* /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
* following attributes.
*
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
*
* Unbound workqueues have the following extra attributes.
*
- * pool_ids RO int : the associated pool IDs for each node
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- * numa RW bool : whether enable NUMA affinity
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ * affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
+ * affinity_strict RW bool : worker CPU affinity is strict
*/
struct wq_device {
struct workqueue_struct *wq;
@@ -5408,26 +5969,17 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static void apply_wqattrs_lock(void)
{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- get_online_cpus();
- rcu_read_lock();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock();
- put_online_cpus();
+ /* CPUs should stay stable across pwq creations and installations */
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+}
- return written;
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
}
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
@@ -5520,50 +6072,84 @@ out_unlock:
return ret ?: count;
}
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static ssize_t wq_affn_scope_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
+ if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
+ written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ wq_affn_names[WQ_AFFN_DFL],
+ wq_affn_names[wq_affn_dfl]);
+ else
+ written = scnprintf(buf, PAGE_SIZE, "%s\n",
+ wq_affn_names[wq->unbound_attrs->affn_scope]);
mutex_unlock(&wq->mutex);
return written;
}
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t wq_affn_scope_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int v, ret = -ENOMEM;
+ int affn, ret = -ENOMEM;
- apply_wqattrs_lock();
+ affn = parse_affn_scope(buf);
+ if (affn < 0)
+ return affn;
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- goto out_unlock;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
+ if (attrs) {
+ attrs->affn_scope = affn;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
-out_unlock:
+static ssize_t wq_affinity_strict_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ wq->unbound_attrs->affn_strict);
+}
+
+static ssize_t wq_affinity_strict_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret = -ENOMEM;
+
+ if (sscanf(buf, "%d", &v) != 1)
+ return -EINVAL;
+
+ apply_wqattrs_lock();
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (attrs) {
+ attrs->affn_strict = (bool)v;
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ }
apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
+ __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
__ATTR_NULL,
};
@@ -5572,19 +6158,74 @@ static struct bus_type wq_subsys = {
.dev_groups = wq_sysfs_groups,
};
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function check the @cpumask
+ * and apply it to all unbound workqueues and updates all pwqs of them.
+ *
+ * Return: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+
+out_unlock:
+ apply_wqattrs_unlock();
+ }
+
+ return ret;
+}
+
+static ssize_t __wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
int written;
mutex_lock(&wq_pool_mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq_unbound_cpumask));
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
mutex_unlock(&wq_pool_mutex);
return written;
}
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -5602,19 +6243,35 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
return ret ? ret : count;
}
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
- wq_unbound_cpumask_store);
+ wq_unbound_cpumask_store),
+ __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+ __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+ __ATTR_NULL,
+};
static int __init wq_sysfs_init(void)
{
+ struct device *dev_root;
int err;
err = subsys_virtual_register(&wq_subsys, NULL);
if (err)
return err;
- return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+ dev_root = bus_get_dev_root(&wq_subsys);
+ if (dev_root) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+ err = device_create_file(dev_root, attr);
+ if (err)
+ break;
+ }
+ put_device(dev_root);
+ }
+ return err;
}
core_initcall(wq_sysfs_init);
@@ -5738,6 +6395,57 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+/*
+ * Show workers that might prevent the processing of pending work items.
+ * The only candidates are CPU-bound workers in the running state.
+ * Pending work items should be handled by another idle worker
+ * in all other situations.
+ */
+static void show_cpu_pool_hog(struct worker_pool *pool)
+{
+ struct worker *worker;
+ unsigned long flags;
+ int bkt;
+
+ raw_spin_lock_irqsave(&pool->lock, flags);
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (task_is_running(worker->task)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
+
+ pr_info("pool %d:\n", pool->id);
+ sched_show_task(worker->task);
+
+ printk_deferred_exit();
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void show_cpu_pools_hogs(void)
+{
+ struct worker_pool *pool;
+ int pi;
+
+ pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ if (pool->cpu_stall)
+ show_cpu_pool_hog(pool);
+
+ }
+
+ rcu_read_unlock();
+}
+
static void wq_watchdog_reset_touched(void)
{
int cpu;
@@ -5751,6 +6459,8 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
+ bool cpu_pool_stall = false;
+ unsigned long now = jiffies;
struct worker_pool *pool;
int pi;
@@ -5762,40 +6472,51 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
+ pool->cpu_stall = false;
if (list_empty(&pool->worklist))
continue;
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a stall.
+ */
+ kvm_check_and_clear_guest_paused();
+
/* get the latest of pool and touched timestamps */
+ if (pool->cpu >= 0)
+ touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
+ else
+ touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
- touched = READ_ONCE(wq_watchdog_touched);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
- if (pool->cpu >= 0) {
- unsigned long cpu_touched =
- READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
- pool->cpu));
- if (time_after(cpu_touched, ts))
- ts = cpu_touched;
- }
-
/* did we stall? */
- if (time_after(jiffies, ts + thresh)) {
+ if (time_after(now, ts + thresh)) {
lockup_detected = true;
+ if (pool->cpu >= 0) {
+ pool->cpu_stall = true;
+ cpu_pool_stall = true;
+ }
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n",
- jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ jiffies_to_msecs(now - pool_ts) / 1000);
}
+
+
}
rcu_read_unlock();
if (lockup_detected)
- show_workqueue_state();
+ show_all_workqueues();
+
+ if (cpu_pool_stall)
+ show_cpu_pools_hogs();
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
@@ -5805,8 +6526,8 @@ notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
- else
- wq_watchdog_touched = jiffies;
+
+ wq_watchdog_touched = jiffies;
}
static void wq_watchdog_set_thresh(unsigned long thresh)
@@ -5859,71 +6580,65 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
-static void __init wq_numa_init(void)
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
- cpumask_var_t *tbl;
- int node, cpu;
-
- if (num_possible_nodes() <= 1)
- return;
-
- if (wq_disable_numa) {
- pr_info("workqueue: NUMA affinity support disabled\n");
+ if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+ pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+ cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
return;
}
- wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_unbound_numa_attrs_buf);
-
- /*
- * We want masks of possible CPUs of each node which isn't readily
- * available. Build one from cpu_to_node() which should have been
- * fully initialized by now.
- */
- tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
- BUG_ON(!tbl);
-
- for_each_node(node)
- BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE));
-
- for_each_possible_cpu(cpu) {
- node = cpu_to_node(cpu);
- if (WARN_ON(node == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- /* happens iff arch is bonkers, let's just proceed */
- return;
- }
- cpumask_set_cpu(cpu, tbl[node]);
- }
-
- wq_numa_possible_cpumask = tbl;
- wq_numa_enabled = true;
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
}
/**
* workqueue_init_early - early init for workqueue subsystem
*
- * This is the first half of two-staged workqueue subsystem initialization
- * and invoked as soon as the bare basics - memory allocation, cpumasks and
- * idr are up. It sets up all the data structures and system workqueues
- * and allows early boot code to create workqueues and queue/cancel work
- * items. Actual work item execution starts only after kthreads can be
- * created and scheduled right before early initcalls.
+ * This is the first step of three-staged workqueue subsystem initialization and
+ * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
+ * up. It sets up all the data structures and system workqueues and allows early
+ * boot code to create workqueues and queue/cancel work items. Actual work item
+ * execution starts only after kthreads can be created and scheduled right
+ * before early initcalls.
*/
void __init workqueue_init_early(void)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
- int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
+ BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+
+ cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+ restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (!cpumask_empty(&wq_cmdline_cpumask))
+ restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
+
+ cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
+ wq_update_pod_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!wq_update_pod_attrs_buf);
+
+ /* initialize WQ_AFFN_SYSTEM pods */
+ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+ pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
+ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+ BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
+
+ BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
+
+ pt->nr_pods = 1;
+ cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
+ pt->pod_node[0] = NUMA_NO_NODE;
+ pt->cpu_pod[0] = 0;
+
/* initialize CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
@@ -5933,7 +6648,9 @@ void __init workqueue_init_early(void)
BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
+ pool->attrs->affn_strict = true;
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
@@ -5954,11 +6671,10 @@ void __init workqueue_init_early(void)
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
- * Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
- attrs->no_numa = true;
+ attrs->ordered = true;
ordered_wq_attrs[i] = attrs;
}
@@ -5966,7 +6682,7 @@ void __init workqueue_init_early(void)
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_UNBOUND_MAX_ACTIVE);
+ WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -5980,14 +6696,53 @@ void __init workqueue_init_early(void)
!system_freezable_power_efficient_wq);
}
+static void __init wq_cpu_intensive_thresh_init(void)
+{
+ unsigned long thresh;
+ unsigned long bogo;
+
+ pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ BUG_ON(IS_ERR(pwq_release_worker));
+
+ /* if the user set it to a specific value, keep it */
+ if (wq_cpu_intensive_thresh_us != ULONG_MAX)
+ return;
+
+ /*
+ * The default of 10ms is derived from the fact that most modern (as of
+ * 2023) processors can do a lot in 10ms and that it's just below what
+ * most consider human-perceivable. However, the kernel also runs on a
+ * lot slower CPUs including microcontrollers where the threshold is way
+ * too low.
+ *
+ * Let's scale up the threshold upto 1 second if BogoMips is below 4000.
+ * This is by no means accurate but it doesn't have to be. The mechanism
+ * is still useful even when the threshold is fully scaled up. Also, as
+ * the reports would usually be applicable to everyone, some machines
+ * operating on longer thresholds won't significantly diminish their
+ * usefulness.
+ */
+ thresh = 10 * USEC_PER_MSEC;
+
+ /* see init/calibrate.c for lpj -> BogoMIPS calculation */
+ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
+ if (bogo < 4000)
+ thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
+
+ pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
+ loops_per_jiffy, bogo, thresh);
+
+ wq_cpu_intensive_thresh_us = thresh;
+}
+
/**
* workqueue_init - bring workqueue subsystem fully online
*
- * This is the latter half of two-staged workqueue subsystem initialization
- * and invoked as soon as kthreads can be created and scheduled.
- * Workqueues have been created and work items queued on them, but there
- * are no kworkers executing the work items yet. Populate the worker pools
- * with the initial workers and enable future kworker creations.
+ * This is the second step of three-staged workqueue subsystem initialization
+ * and invoked as soon as kthreads can be created and scheduled. Workqueues have
+ * been created and work items queued on them, but there are no kworkers
+ * executing the work items yet. Populate the worker pools with the initial
+ * workers and enable future kworker creations.
*/
void __init workqueue_init(void)
{
@@ -5995,19 +6750,14 @@ void __init workqueue_init(void)
struct worker_pool *pool;
int cpu, bkt;
- /*
- * It'd be simpler to initialize NUMA in workqueue_init_early() but
- * CPU to node mapping may not be available that early on some
- * archs such as power and arm64. As per-cpu pools created
- * previously could be missing node hint and unbound pools NUMA
- * affinity, fix them up.
- *
- * Also, while iterating workqueues, create rescuers if requested.
- */
- wq_numa_init();
+ wq_cpu_intensive_thresh_init();
mutex_lock(&wq_pool_mutex);
+ /*
+ * Per-cpu pools created earlier could be missing node hint. Fix them
+ * up. Also, create a rescuer for workqueues that requested it.
+ */
for_each_possible_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
@@ -6015,7 +6765,6 @@ void __init workqueue_init(void)
}
list_for_each_entry(wq, &workqueues, list) {
- wq_update_unbound_numa(wq, smp_processor_id(), true);
WARN(init_rescuer(wq),
"workqueue: failed to create early rescuer for %s",
wq->name);
@@ -6037,3 +6786,116 @@ void __init workqueue_init(void)
wq_online = true;
wq_watchdog_init();
}
+
+/*
+ * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
+ * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique
+ * and consecutive pod ID. The rest of @pt is initialized accordingly.
+ */
+static void __init init_pod_type(struct wq_pod_type *pt,
+ bool (*cpus_share_pod)(int, int))
+{
+ int cur, pre, cpu, pod;
+
+ pt->nr_pods = 0;
+
+ /* init @pt->cpu_pod[] according to @cpus_share_pod() */
+ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+ BUG_ON(!pt->cpu_pod);
+
+ for_each_possible_cpu(cur) {
+ for_each_possible_cpu(pre) {
+ if (pre >= cur) {
+ pt->cpu_pod[cur] = pt->nr_pods++;
+ break;
+ }
+ if (cpus_share_pod(cur, pre)) {
+ pt->cpu_pod[cur] = pt->cpu_pod[pre];
+ break;
+ }
+ }
+ }
+
+ /* init the rest to match @pt->cpu_pod[] */
+ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+ pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
+ BUG_ON(!pt->pod_cpus || !pt->pod_node);
+
+ for (pod = 0; pod < pt->nr_pods; pod++)
+ BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
+
+ for_each_possible_cpu(cpu) {
+ cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
+ pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
+ }
+}
+
+static bool __init cpus_dont_share(int cpu0, int cpu1)
+{
+ return false;
+}
+
+static bool __init cpus_share_smt(int cpu0, int cpu1)
+{
+#ifdef CONFIG_SCHED_SMT
+ return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
+#else
+ return false;
+#endif
+}
+
+static bool __init cpus_share_numa(int cpu0, int cpu1)
+{
+ return cpu_to_node(cpu0) == cpu_to_node(cpu1);
+}
+
+/**
+ * workqueue_init_topology - initialize CPU pods for unbound workqueues
+ *
+ * This is the third step of there-staged workqueue subsystem initialization and
+ * invoked after SMP and topology information are fully initialized. It
+ * initializes the unbound CPU pods accordingly.
+ */
+void __init workqueue_init_topology(void)
+{
+ struct workqueue_struct *wq;
+ int cpu;
+
+ init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
+ init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+ init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
+ init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
+
+ mutex_lock(&wq_pool_mutex);
+
+ /*
+ * Workqueues allocated earlier would have all CPUs sharing the default
+ * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
+ * combinations to apply per-pod sharing.
+ */
+ list_for_each_entry(wq, &workqueues, list) {
+ for_each_online_cpu(cpu) {
+ wq_update_pod(wq, cpu, cpu, true);
+ }
+ }
+
+ mutex_unlock(&wq_pool_mutex);
+}
+
+void __warn_flushing_systemwide_wq(void)
+{
+ pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
+ dump_stack();
+}
+EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
+
+static int __init workqueue_unbound_cpus_setup(char *str)
+{
+ if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) {
+ cpumask_clear(&wq_cmdline_cpumask);
+ pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
+ }
+
+ return 1;
+}
+__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 498de0e909a4..f6275944ada7 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -28,12 +28,18 @@ struct worker {
struct hlist_node hentry; /* L: while busy */
};
- struct work_struct *current_work; /* L: work being processed */
- work_func_t current_func; /* L: current_work's fn */
- struct pool_workqueue *current_pwq; /* L: current_work's pwq */
- struct list_head scheduled; /* L: scheduled works */
+ struct work_struct *current_work; /* K: work being processed and its */
+ work_func_t current_func; /* K: function */
+ struct pool_workqueue *current_pwq; /* K: pwq */
+ u64 current_at; /* K: runtime at start or last wakeup */
+ unsigned int current_color; /* K: color */
+
+ int sleeping; /* S: is worker sleeping? */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
+ /* used by the scheduler to determine a worker's last known identity */
+ work_func_t last_func; /* K: last work's fn */
+
+ struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* A: the associated pool */
@@ -41,10 +47,9 @@ struct worker {
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
- unsigned long last_active; /* L: last active timestamp */
- unsigned int flags; /* X: flags */
+ unsigned long last_active; /* K: last active timestamp */
+ unsigned int flags; /* L: flags */
int id; /* I: worker id */
- int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -54,9 +59,6 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
-
- /* used by the scheduler to determine a worker's last known identity */
- work_func_t last_func;
};
/**
@@ -75,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
*/
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */